diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/htsearch')
69 files changed, 10353 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/.cvsignore b/debian/htdig/htdig-3.2.0b6/htsearch/.cvsignore new file mode 100644 index 00000000..f4f41320 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/.cvsignore @@ -0,0 +1,8 @@ +Makefile +*.lo +*.la +.purify +.pure +.deps +.libs +htsearch diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/AndQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/AndQuery.cc new file mode 100644 index 00000000..a1a608e5 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/AndQuery.cc @@ -0,0 +1,150 @@ +// +// AndQuery.cc +// +// AndQuery: an operator query that does 'and' combination +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: AndQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + + +#include "AndQuery.h" +// +// l r and +// ---------------------- +// 0 0 0 +// 0 b 0 +// 0 x 0 +// a 0 0 +// a b intersect(a,b) +// a x a +// x 0 0 +// x b b +// x x x +// +// i.e. 
some 0 => 0 +// ignores can be left out of intersection +// the shorter of the result lists is put apart for intersection +// this optimises the intersection process +// + +ResultList * +AndQuery::Evaluate() +{ + ResultList *result = 0; + ResultList *shorter = 0; + + operands.Start_Get(); + Query *operand = (Query *) operands.Get_Next(); + while(operand && !shorter) + { + result = operand->GetResults(); + if(!result) + { + break; + } + if(!result->IsIgnore()) + { + shorter = result; + } + operand = (Query *) operands.Get_Next(); + } + if(shorter) + { + List longer; + while(operand && result) + { + result = operand->GetResults(); + if(result && !result->IsIgnore()) + { + if(result->Count() < shorter->Count()) + { + longer.Add(shorter); + shorter = result; + } + else + { + longer.Add(result); + } + } + operand = (Query *) operands.Get_Next(); + } + if(longer.Count()) + { + result = Intersection(*shorter, longer); + longer.Release(); + } + else + { + result = new ResultList(*shorter); + } + } + return result; +} + +// +// return a result list containing only the matches common to +// all input parameters. 
+// +// l is iterated, matches from l are searched in all elements of rs +// +// +// foreach match in shorter +// confirm the match in each lists +// if confirmed +// copy all matches in result +// +// the shorter of the input lists is assumed to be in the first parameter +// this is a modest optimisation in order to minimise iteration +// + +ResultList * +AndQuery::Intersection(const ResultList &shorter, const List &lists) +{ + ResultList *result = 0; + DictionaryCursor c; + shorter.Start_Get(c); + DocMatch *match = (DocMatch *)shorter.Get_NextElement(c); + while(match) + { + List confirms; + + ListCursor lc; + lists.Start_Get(lc); + ResultList *list = (ResultList *)lists.Get_Next(lc); + while(list) + { + DocMatch *confirm = list->find(match->GetId()); + if(confirm) + { + confirms.Add(confirm); + } + list = (ResultList *)lists.Get_Next(lc); + } + if(confirms.Count() == lists.Count()) + { + if(!result) + { + result = new ResultList; + } + DocMatch *copy = new DocMatch(*match); + confirms.Start_Get(); + DocMatch *confirm = (DocMatch *)confirms.Get_Next(); + while(confirm) + { + copy->Merge(*confirm); + confirm = (DocMatch *)confirms.Get_Next(); + } + result->add(copy); + } + confirms.Release(); + match = (DocMatch *)shorter.Get_NextElement(c); + } + return result; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/AndQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/AndQuery.h new file mode 100644 index 00000000..f93ccca7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/AndQuery.h @@ -0,0 +1,42 @@ +#ifndef _AndQuery_h_ +#define _AndQuery_h_ + +// +// AndQuery.h +// +// AndQuery: an operator query that does 'and' combination +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: AndQuery.h,v 1.4 2004/05/28 13:15:24 
lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "OperatorQuery.h" + +// +// and query +// +class AndQuery : public OperatorQuery +{ +public: + +private: + // evaluate operands and intersect results + ResultList *Evaluate(); + + // create an intersection of the operand results + ResultList *Intersection(const ResultList &shorter, const List &longer); + + // used by GetLogicalWords + String OperatorString() const { return String("and"); } +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/AndQueryParser.h b/debian/htdig/htdig-3.2.0b6/htsearch/AndQueryParser.h new file mode 100644 index 00000000..a17c80fb --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/AndQueryParser.h @@ -0,0 +1,33 @@ +#ifndef _AndQueryParser_h_ +#define _AndQueryParser_h_ + +// +// AndQueryParser.h +// +// AndQueryParser: a simple query parser for 'all words' queries +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: AndQueryParser.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "SimpleQueryParser.h" +#include "AndQuery.h" + +class AndQueryParser : public SimpleQueryParser +{ +public: + AndQueryParser() {} + +private: + OperatorQuery *MakeQuery() + { + return new AndQuery; + } +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/BooleanLexer.cc b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanLexer.cc new file mode 100644 index 00000000..3c965d72 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanLexer.cc @@ -0,0 +1,76 @@ +// +// BooleanLexer.cc +// +// BooleanLexer: lexical analyzer for boolean query expressions. 
+// defines terminal symbols +// "word", and, or, not, near, (, ), / +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BooleanLexer.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "BooleanLexer.h" +bool +BooleanLexer::IsOr() const +{ + return current == String("or"); +} + +bool +BooleanLexer::IsAnd() const +{ + return current == String("and"); +} + +bool +BooleanLexer::IsNot() const +{ + return current == String("not"); +} + +bool +BooleanLexer::IsNear() const +{ + return current == String("near"); +} + +bool +BooleanLexer::IsSlash() const +{ + return current == String("/"); +} + +bool +BooleanLexer::IsLeftParen() const +{ + return current == String("("); +} + + +bool +BooleanLexer::IsRightParen() const +{ + return current == String(")"); +} + +bool +BooleanLexer::IsWord() const +{ + return !IsEnd() + && !IsQuote() + && !IsRightParen() + && !IsLeftParen() + && !IsSlash() + && !IsAnd() + && !IsOr() + && !IsAnd() + && !IsNot() + && !IsNear(); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/BooleanLexer.h b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanLexer.h new file mode 100644 index 00000000..677f9ed3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanLexer.h @@ -0,0 +1,50 @@ +#ifndef _BooleanLexer_h_ +#define _BooleanLexer_h_ + +// +// BooleanLexer.h +// +// BooleanLexer: lexical analyzer for boolean query expressions. 
+// defines terminal symbols +// "word", and, or, not, near, (, ), / +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BooleanLexer.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryLexer.h" + +class BooleanLexer : public QueryLexer +{ +public: + // is the current token a word? + bool IsWord() const; + + // is the current token the 'and' keyword? + bool IsAnd() const; + + // is the current token the 'or' keyword? + bool IsOr() const; + + // is the current token the 'near' keyword? + bool IsNear() const; + + // is the current token the 'not' keyword? + bool IsNot() const; + + // is the current token the '(' sign? + bool IsLeftParen() const; + + // is the current token the ')' sign? + bool IsRightParen() const; + + // is the current token the '/' sign? 
+ bool IsSlash() const; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/BooleanQueryParser.cc b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanQueryParser.cc new file mode 100644 index 00000000..dc5451a7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanQueryParser.cc @@ -0,0 +1,238 @@ +// +// BooleanQueryParser.cc +// +// BooleanQueryParser: Query parser for full-blown boolean expressions +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BooleanQueryParser.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "BooleanQueryParser.h" + +#include "OrQuery.h" +#include "NotQuery.h" +#include "AndQuery.h" +#include "NearQuery.h" +#include "PhraseQuery.h" +#include "FuzzyExpander.h" + +// +// expr == andlist ( 'or' andlist ) +// +Query * +BooleanQueryParser::ParseExpression() +{ + Query *result = 0; + Query *term = ParseAnd(); + if(term) + { + if(token.IsOr()) + { + result = new OrQuery; + result->Add(term); + while(term && token.IsOr()) + { + token.Next(); + term = ParseAnd(); + if(term) + { + result->Add(term); + } + } + } + else + { + result = term; + } + } + if(!term && result) + { + delete result; + result = 0; + } + return result; +} + +// +// notlist = nearlist { 'not' nearlist } +// +Query * +BooleanQueryParser::ParseNot() +{ + Query *result = 0; + Query *near = ParseNear(); + if(near) + { + if(token.IsNot()) + { + result = new NotQuery(); + result->Add(near); + while(near && token.IsNot()) + { + token.Next(); + near = ParseNear(); + if(near) + { + result->Add(near); + } + } + } + else + { + result = near; + } + } + if(!near && result) + { + delete result; + result = 0; + } + return result; +} + +// +// andlist = notlist { 'and' notlist } +// +Query * +BooleanQueryParser::ParseAnd() +{ 
+ Query *result = 0; + Query *notList = ParseNot(); + + if(notList) + { + if(token.IsAnd()) + { + result = new AndQuery(); + result->Add(notList); + while(notList && token.IsAnd()) + { + token.Next(); + notList = ParseNot(); + if(notList) + { + result->Add(notList); + } + } + } + else + { + result = notList; + } + } + if(!notList && result) + { + delete result; + result = 0; + } + return result; +} + +// +// near == factor { 'near' [ '/' number ] factor } +// 'near' query is binary +// +Query * +BooleanQueryParser::ParseNear() +{ + Query *result = ParseFactor(); + while(result && token.IsNear()) + { + token.Next(); + int distance = 10; // config["default_near_distance"]; + if(token.IsSlash()) + { + distance = 0; + token.Next(); + if(token.IsWord()) + { + distance = token.Value().as_integer(); + token.Next(); + } + } + if(distance > 0) + { + Query *right = ParseFactor(); + if(right) + { + Query *tmp = new NearQuery(distance); + tmp->Add(result); + tmp->Add(right); + result = tmp; + } + else + { + delete result; + result = 0; + } + } + else + { + Expected("a distance > 0 for 'Near'"); + delete result; + result = 0; + } + } + return result; +} + +// +// factor == word | '"' phrase '"' | '(' expression ')' +// +Query * +BooleanQueryParser::ParseFactor() +{ + Query *result = 0; + + if(token.IsWord()) + { + result = ParseWord(); + } + else if(token.IsQuote()) + { + token.Next(); + result = ParsePhrase(); + if(result) + { + if(token.IsQuote()) + { + token.Next(); + } + else + { + Expected("closing \""); + delete result; + result = 0; + } + } + } + else if(token.IsLeftParen()) + { + token.Next(); + result = ParseExpression(); + if(result) + { + if(token.IsRightParen()) + { + token.Next(); + } + else + { + Expected(")"); + delete result; + result = 0; + } + } + } + else + { + Expected("'(', '\"', or a word"); + } + return result; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/BooleanQueryParser.h b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanQueryParser.h new file 
mode 100644 index 00000000..d65eaa8f --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/BooleanQueryParser.h @@ -0,0 +1,43 @@ +#ifndef _BooleanQueryParser_h_ +#define _BooleanQueryParser_h_ + +// +// BooleanQueryParser.h +// +// BooleanQueryParser: Query parser for full-blown boolean expressions +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: BooleanQueryParser.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryParser.h" +#include "BooleanLexer.h" + +class BooleanQueryParser : public QueryParser +{ +public: + BooleanQueryParser() {} + ~BooleanQueryParser() {} + +private: + // recursive parse levels + // returning constructed query trees + Query *ParseExpression(); + Query *ParseAnd(); + Query *ParseNot(); + Query *ParseNear(); + Query *ParseFactor(); + + // lexer access needed by parent class + QueryLexer &Token() { return token; } + + // the lexical analyzer + BooleanLexer token; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Collection.cc b/debian/htdig/htdig-3.2.0b6/htsearch/Collection.cc new file mode 100644 index 00000000..9e40f230 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Collection.cc @@ -0,0 +1,105 @@ +// +// Collection.cc +// +// Collection: Specifies a list of databases to use in the search +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Collection.cc,v 1.7 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htsearch.h" +#include 
"Collection.h" +#include "ResultMatch.h" +#include "WeightWord.h" +#include "StringMatch.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "HtURLCodec.h" + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> +#include <ctype.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <syslog.h> +#endif + +#include <locale.h> + +//***************************************************************************** +// +Collection::Collection(const char *name, const char *word_file, + const char *index_file, const char *doc_file, + const char *doc_excerpt) +{ + collectionName = name; + wordFile = word_file; + indexFile = index_file; + docFile = doc_file; + docExcerpt = doc_excerpt; + matches = NULL; + searchWords = NULL; + searchWordsPattern = NULL; + isopen = 0; +} + +Collection::~Collection() +{ + if(matches) delete matches; + if(searchWords) delete searchWords; + if(searchWordsPattern) delete searchWordsPattern; + Close(); +} + +void +Collection::Open() +{ + if (!isopen) + { + docDB.Read(docFile, indexFile, docExcerpt); + } + isopen = 1; +} + +void +Collection::Close() +{ + if (isopen) + { + docDB.Close(); + } + isopen = 0; +} + +DocumentRef * +Collection::getDocumentRef(int id) +{ + Open(); + return docDB[id]; +} + +int +Collection::ReadExcerpt(DocumentRef &ref) +{ + Open(); + return docDB.ReadExcerpt(ref); +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Collection.h b/debian/htdig/htdig-3.2.0b6/htsearch/Collection.h new file mode 100644 index 00000000..5b61e6bb --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Collection.h @@ -0,0 +1,73 @@ +// +// Collection.h +// +// Collection: Specifies a list of databases to use in the search +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public 
License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Collection.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// +#ifndef _Collection_h_ +#define _Collection_h_ + +#include "Object.h" +#include "ResultList.h" +#include "ResultMatch.h" +#include "TemplateList.h" +#include "cgi.h" +#include "StringMatch.h" +#include "List.h" +#include "DocumentDB.h" +#include "Database.h" +#include "Dictionary.h" + +class Collection : public Object +{ +public: + // + // Construction/Destruction + // + Collection(const char *name, const char *wordFile, + const char *indexFile, const char *docFile, + const char *docExcerpt); + ~Collection(); + + void Collection::Open(); + + void Collection::Close(); + + char *getWordFile() { return wordFile.get(); } + DocumentRef *getDocumentRef(int id); + ResultList *getResultList() { return matches; } + void setResultList(ResultList *list) { matches = list; } + + List *getSearchWords() { return searchWords; } + void setSearchWords(List *list) { searchWords = list; } + + StringMatch *getSearchWordsPattern() { return searchWordsPattern;} + void setSearchWordsPattern(StringMatch *smatch) + { searchWordsPattern = smatch; } + + int ReadExcerpt(DocumentRef &ref); + +protected: + String collectionName; + String wordFile; + String indexFile; + String docFile; + String docExcerpt; + ResultList *matches; + List *searchWords; + StringMatch *searchWordsPattern; + + DocumentDB docDB; + // Database *docIndex; + + int isopen; +}; + +#endif // _Collection_h_ diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Display.cc b/debian/htdig/htdig-3.2.0b6/htsearch/Display.cc new file mode 100644 index 00000000..f2300137 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Display.cc @@ -0,0 +1,1956 @@ +// +// Display.cc +// +// Display: Takes results of search and fills in the HTML templates +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file 
COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Display.cc,v 1.122 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htsearch.h" +#include "Display.h" +#include "ResultMatch.h" +#include "WeightWord.h" +#include "StringMatch.h" +#include "QuotedStringList.h" +#include "URL.h" +#include "HtSGMLCodec.h" +#include "HtURLCodec.h" +#include "HtURLRewriter.h" +#include "WordType.h" +#include "Collection.h" +#include "HtURLSeedScore.h" +//#include "HtURLRewriter.h" +#include "SplitMatches.h" + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +#include <stdio.h> +#include <stdlib.h> // for abs +#include <ctype.h> + +#ifndef _MSC_VER /* _WIN32 */ +#include <syslog.h> +#endif + +#include <locale.h> + + +#include <math.h> +#include <float.h> + +#if !defined(DBL_MAX) +# if defined (MAXDOUBLE) +# define DBL_MAX MAXDOUBLE +# elif defined(HUGE_VAL) +# define DBL_MAX HUGE_VAL +# elif defined(MAXFLOAT) +# define DBL_MAX MAXFLOAT +# else +# define DBL_MAX 1e37 +# endif +#endif + +//***************************************************************************** +// +Display::Display(Dictionary *collections) +{ + HtConfiguration* config= HtConfiguration::config(); + selected_collections = collections; + limitTo = 0; + excludeFrom = 0; + // needExcerpt = 0; + templateError = 0; + + maxStars = config->Value("max_stars"); + maxScore = -DBL_MAX; + minScore = DBL_MAX; + setupImages(); + setupTemplates(); + + if (!templates.createFromString(config->Find("template_map"))) + { + // Error in createFromString. 
+ // Let's try the default template_map + + config->Add("template_map", + "Long builtin-long builtin-long Short builtin-short builtin-short"); + if (!templates.createFromString(config->Find("template_map"))) + { + // Unrecoverable Error + // (No idea why this would happen) + templateError = 1; + } + } + + currentTemplate = templates.get(config->Find("template_name")); + if (!currentTemplate) + { + // + // Must have been some error. Resort to the builtin-long (slot 0) + // + currentTemplate = (Template *) templates.templates[0]; + } + if (!currentTemplate) + { + // + // Another error!? Time to bail out... + // + templateError = 1; + } + // if (mystrcasestr(currentTemplate->getMatchTemplate(), "excerpt")) + // needExcerpt = 1; +} + +//***************************************************************************** +Display::~Display() +{ + // docDB.Close(); +} + +//***************************************************************************** +// +void +Display::display(int pageNumber) +{ + HtConfiguration* config= HtConfiguration::config(); + int good_sort = 0; + good_sort = ResultMatch::setSortType(config->Find("sort")); + if (!good_sort) + { + // Must temporarily stash the message in a String, since + // displaySyntaxError will overwrite the static temp used in form. + + String s(form("No such sort method: `%s'", (const char*)config->Find("sort"))); + + displaySyntaxError(s); + return; + } + + List *matches = buildMatchList(); + int currentMatch = 0; + int numberDisplayed = 0; + ResultMatch *match = 0; + int number = 0; + number = config->Value("matches_per_page"); + if (number <= 0) + number = 10; + int startAt = (pageNumber - 1) * number; + + if (config->Boolean("logging")) + { + logSearch(pageNumber, matches); + } + + displayHTTPheaders(); + setVariables(pageNumber, matches); + + // + // The first match is guaranteed to have the highest score of + // all the matches. We use this to compute the number of stars + // to display for all the other matches. 
+ // + match = (ResultMatch *) (*matches)[0]; + if (!match) + { + // + // No matches. + // + delete matches; +// if( config->Boolean("nph") ) cout << "HTTP/1.0 200 OK\r\n"; +// cout << "Content-type: text/html\r\n\r\n"; + displayNomatch(); + return; + } + // maxScore = match->getScore(); // now done in buildMatchList() + +// if( config->Boolean("nph") ) cout << "HTTP/1.0 200 OK\r\n"; +// cout << "Content-type: text/html\r\n\r\n"; + String wrap_file = config->Find("search_results_wrapper"); + String *wrapper = 0; + char *header = 0, *footer = 0; + if (wrap_file.length()) + { + wrapper = readFile(wrap_file.get()); + if (wrapper && wrapper->length()) + { + char wrap_sepr[] = "HTSEARCH_RESULTS"; + char *h = wrapper->get(); + char *p = strstr(h, wrap_sepr); + if (p) + { + if (p > h && p[-1] == '$') + { + footer = p + strlen(wrap_sepr); + header = h; + p[-1] = '\0'; + } + else if (p > h+1 && p[-2] == '$' && + (p[-1] == '(' || p[-1] == '{') && + (p[strlen(wrap_sepr)] == ')' || + p[strlen(wrap_sepr)] == '}')) + { + footer = p + strlen(wrap_sepr) + 1; + header = h; + p[-2] = '\0'; + } + } + } + } + if (header) + expandVariables(header); + else + displayHeader(); + + // + // Display the window of matches requested. 
+ // + if (!currentTemplate->getStartTemplate().empty()) + { + expandVariables(currentTemplate->getStartTemplate()); + } + + matches->Start_Get(); + while ((match = (ResultMatch *)matches->Get_Next()) && + numberDisplayed < number) + { + if (currentMatch >= startAt) + { + // DocumentRef *ref = docDB[match->getID()]; + Collection *collection = match->getCollection(); + DocumentRef *ref = collection->getDocumentRef(match->getID()); + if (!ref || ref->DocState() != Reference_normal) + continue; // The document isn't present or shouldn't be displayed + ref->DocAnchor(match->getAnchor()); + ref->DocScore(match->getScore()); + displayMatch(match, ref, currentMatch+1); + numberDisplayed++; + delete ref; + } + currentMatch++; + } + + if (!currentTemplate->getEndTemplate().empty()) + { + expandVariables(currentTemplate->getEndTemplate()); + } + if (footer) + expandVariables(footer); + else + displayFooter(); + + if (wrapper) + delete wrapper; + delete matches; +} + +//***************************************************************************** +// Return true if the specified URL should be counted towards the results. 
+int +Display::includeURL(const String& url) +{ + + if (limitTo && limitTo->match(url, 1, 0) == 0) + return 0; + else + { + + if (excludeFrom && excludeFrom->match(url, 0, 0) != 0) + return 0; + else + return 1; + } +} + +//***************************************************************************** +void +Display::displayMatch(ResultMatch *match, DocumentRef *ref, int current) +{ + HtConfiguration* config= HtConfiguration::config(); + String *str = 0; + + char *coded_url = ref->DocURL(); + String url = HtURLCodec::instance()->decode(coded_url); + HtURLRewriter::instance()->replace(url); + ref->DocURL(url.get()); // for star_patterns & template_patterns match + vars.Add("URL", new String(url.get())); + + vars.Remove("ANCHOR"); // get rid of any previous setting + int iA = ref->DocAnchor(); + + String *anchor = 0; + int fanchor = 0; + if (iA > 0) // if an anchor was found + { + List *anchors = ref->DocAnchors(); + if (anchors->Count() >= iA) + { + anchor = new String(); + fanchor = 1; + *anchor << "#" << ((String*) (*anchors)[iA-1])->get(); + vars.Add("ANCHOR", anchor); + } + } + + // + // no condition for determining excerpt any more: + // we need it anyway to see if an anchor is relevant + // + int first = -1; + String urlanchor(url); + if (anchor) + urlanchor << anchor; + vars.Add("EXCERPT", excerpt(match, ref, urlanchor, fanchor, first)); + // + // anchor only relevant if an excerpt was found, i.e., + // the search expression matches the body of the document + // instead of only META keywords. 
+ // + if (first < 0) + { + vars.Remove("ANCHOR"); + } + + vars.Add("METADESCRIPTION", new String(ref->DocMetaDsc())); + vars.Add("SCORE", new String(form("%f", ref->DocScore()))); + vars.Add("CURRENT", new String(form("%d", current))); + char *title = ref->DocTitle(); + if (!title || !*title) + { + if ( strcmp(config->Find("no_title_text"), "filename") == 0 ) + { + // use actual file name + title = strrchr(url.get(), '/'); + if (title) + { + title++; // Skip slash + str = new String(form("[%s]", title)); + decodeURL(*str); // convert %20 to space, etc + } + else + // URL without '/' ?? + str = new String("[No title]"); + } + else + // use configure 'no title' text + str = new String(config->Find("no_title_text")); + } + else + str = new String(title); + vars.Add("TITLE", str); + vars.Add("STARSRIGHT", generateStars(ref, 1)); + vars.Add("STARSLEFT", generateStars(ref, 0)); + vars.Add("SIZE", new String(form("%d", ref->DocSize()))); + vars.Add("SIZEK", new String(form("%d", + (ref->DocSize() + 1023) / 1024))); + + if (maxScore != 0 && maxScore != minScore) + { + int percent = (int)((ref->DocScore() - minScore) * 100 / + (maxScore - minScore)); + if (percent <= 0) + percent = 1; + vars.Add("PERCENT", new String(form("%d", percent))); + } + else + vars.Add("PERCENT", new String("100")); + + { + str = new String(); + char buffer[100]; + time_t t = ref->DocTime(); + if (t) + { + struct tm *tm = localtime(&t); + String datefmt = config->Find("date_format"); + const String locale = config->Find("locale"); + if (datefmt.empty()) + { + if (config->Boolean("iso_8601")) + datefmt = "%Y-%m-%d %H:%M:%S %Z"; + else + datefmt = "%x"; + } + if (!locale.empty()) + { + setlocale(LC_TIME,locale); + } + strftime(buffer, sizeof(buffer), (char*)datefmt, tm); + *str << buffer; + } + vars.Add("MODIFIED", str); + } + + vars.Add("HOPCOUNT", new String(form("%d", ref->DocHopCount()))); + vars.Add("DOCID", new String(form("%d", ref->DocID()))); + vars.Add("BACKLINKS", new String(form("%d", 
ref->DocBackLinks()))); + + { + str = new String(); + List *list = ref->Descriptions(); + int n = list->Count(); + for (int i = 0; i < n; i++) + { + *str << ((String*) (*list)[i])->get() << "<br>"; + } + vars.Add("DESCRIPTIONS", str); + String *description = new String(); + if (list->Count()) + *description << ((String*) (*list)[0]); + vars.Add("DESCRIPTION", description); + } + + int index = 0; + int length = 0; + int status = -1; + if (URLtemplate.hasPattern()) + status = URLtemplate.FindFirst(ref->DocURL(), index, length); + if (status >= 0 && index >= 0) + displayParsedFile( ((String*) URLtemplateList[index])->get() ); + else + expandVariables(currentTemplate->getMatchTemplate()); +} + +//***************************************************************************** +void +Display::setVariables(int pageNumber, List *matches) +{ + HtConfiguration* config= HtConfiguration::config(); + String tmp; + int i; + int nMatches = 0; + + if (matches) + nMatches = matches->Count(); + + int matchesPerPage = config->Value("matches_per_page"); + if (matchesPerPage <= 0) + matchesPerPage = 10; + int nPages = (nMatches + matchesPerPage - 1) / matchesPerPage; + + if (nPages > config->Value("maximum_pages", 10)) + nPages = config->Value("maximum_pages", 10); + if (nPages < 1) + nPages = 1; // We always have at least one page... + vars.Add("MATCHES_PER_PAGE", new String(config->Find("matches_per_page"))); + vars.Add("MAX_STARS", new String(config->Find("max_stars"))); + vars.Add("CONFIG", new String(config->Find("config"))); + vars.Add("VERSION", new String(config->Find("version"))); + vars.Add("RESTRICT", new String(config->Find("restrict"))); + vars.Add("EXCLUDE", new String(config->Find("exclude"))); + vars.Add("KEYWORDS", new String(config->Find("keywords"))); + vars.Add("MATCHES", new String(form("%d", nMatches))); + vars.Add("PLURAL_MATCHES", new String((nMatches == 1) ? 
(char *)"" : (const char *) config->Find("plural_suffix"))); + vars.Add("PAGE", new String(form("%d", pageNumber))); + vars.Add("PAGES", new String(form("%d", nPages))); + vars.Add("FIRSTDISPLAYED", + new String(form("%d", (pageNumber - 1) * + matchesPerPage + 1))); + if (nPages > 1) + vars.Add("PAGEHEADER", new String(config->Find("page_list_header"))); + else + vars.Add("PAGEHEADER", new String(config->Find("no_page_list_header"))); + + i = pageNumber * matchesPerPage; + if (i > nMatches) + i = nMatches; + vars.Add("LASTDISPLAYED", new String(form("%d", i))); + + if (config->Find("script_name").length() != 0) { + vars.Add("CGI", new String(config->Find("script_name"))); + } else { + vars.Add("CGI", new String(getenv("SCRIPT_NAME"))); + } + vars.Add("STARTYEAR", new String(config->Find("startyear"))); + vars.Add("STARTMONTH", new String(config->Find("startmonth"))); + vars.Add("STARTDAY", new String(config->Find("startday"))); + vars.Add("ENDYEAR", new String(config->Find("endyear"))); + vars.Add("ENDMONTH", new String(config->Find("endmonth"))); + vars.Add("ENDDAY", new String(config->Find("endday"))); + + String *str; + char *format = input->get("format"); + String *in; + + vars.Add("SELECTED_FORMAT", new String(format)); + + str = new String(); + *str << "<select name=\"format\">\n"; + for (i = 0; i < templates.displayNames.Count(); i++) + { + in = (String *) templates.internalNames[i]; + *str << "<option value=\"" << in->get() << '"'; + if (format && mystrcasecmp(in->get(), format) == 0) + { + *str << " selected"; + } + *str << '>' << ((String*)templates.displayNames[i])->get() << '\n'; + } + *str << "</select>\n"; + vars.Add("FORMAT", str); + + str = new String(); + tmp = config->Find("match_method"); + vars.Add("SELECTED_METHOD", new String(tmp)); + QuotedStringList ml(config->Find("method_names"), " \t\r\n"); + *str << "<select name=\"method\">\n"; + for (i = 0; i < ml.Count(); i += 2) + { + *str << "<option value=\"" << ml[i] << '"'; + if 
(mystrcasecmp(ml[i], tmp) == 0) + { + *str << " selected"; + vars.Add("MATCH_MESSAGE", new String(ml[i+1])); + } + *str << '>' << ml[i + 1] << '\n'; + } + *str << "</select>\n"; + vars.Add("METHOD", str); + + ////////////////// Multiple database support ////////////////////// + // Emit collection table. Ensure that previously selected collections + // are "checked". + // Collections are specified in the config file with the + // "collection_names" attribute. An example of the corresponding snippet + // in the config file is as follows: + // + // collection_names: htdig_docs htdig_bugs + // + // htdig_bugs and htdig_docs are the two collections (databases) and + // their corresponding config files are: $CONFIG_DIR/htdig_bugs.conf and + // $CONFIG_DIR/htdig_docs.conf respectively. + // + QuotedStringList clist(config->Find("collection_names"), " \t\r\n"); + for (i =0; i < clist.Count(); i++) + { + String config_name = clist[i]; + + for (int j=0; j < collectionList.Count(); j++) + { + if (strcmp(config_name.get(), collectionList[j]) == 0) + { + str = new String(); + *str << "checked"; + String collection_id = "COLLECTION_"; + collection_id << config_name; + vars.Add(collection_id, str); + break; + } + } + } + + ////////////////// Multiple database support ////////////////////// + + str = new String(); + QuotedStringList sl(config->Find("sort_names"), " \t\r\n"); + const String st = config->Find("sort"); + StringMatch datetime; + datetime.IgnoreCase(); + datetime.Pattern("date|time"); + *str << "<select name=\"sort\">\n"; + for (i = 0; i < sl.Count(); i += 2) + { + *str << "<option value=\"" << sl[i] << '"'; + if (mystrcasecmp(sl[i], st) == 0 || + datetime.Compare(sl[i]) && datetime.Compare(st) || + mystrncasecmp(sl[i], st, 3) == 0 && + datetime.Compare(sl[i]+3) && datetime.Compare(st.get()+3)) + *str << " selected"; + *str << '>' << sl[i + 1] << '\n'; + } + *str << "</select>\n"; + vars.Add("SORT", str); + vars.Add("SELECTED_SORT", new String(st)); + + // Handle 
user-defined select lists. + // Uses octuples containing these values: + // <tempvar> <inparm> <namelistattr> <ntuple> <ivalue> <ilabel> + // <defattr> <deflabel> + // e.g.: + // METHOD_LIST method method_names 2 1 2 match_method "" + // FORMAT_LIST format template_map 3 2 1 template_name "" + // EXCLUDE_LIST exclude exclude_names 2 1 2 exclude "" + // MATCH_LIST matchesperpage matches_per_page_list 1 1 1 + // matches_per_page "Previous Amount" + QuotedStringList builds(config->Find("build_select_lists"), " \t\r\n"); + for (int b = 0; b <= builds.Count()-8; b += 8) + { + int ntuple = atoi(builds[b+3]); + int ivalue = atoi(builds[b+4]); + int ilabel = atoi(builds[b+5]); + int nsel = 0; + int mult = 0, asinput = 0; + const char *cp; + char sepc = '\001'; + String currval; + String pre, post; + QuotedStringList nameopt(builds[b], ",", 1); + QuotedStringList namelist(config->Find(builds[b+2]), " \t\r\n"); + if (ntuple > 0 && ivalue > 0 && ivalue <= ntuple + && ilabel > 0 && ilabel <= ntuple && namelist.Count() % ntuple == 0 + && nameopt.Count() > 0) + { + if (strcmp(builds[b+1], "restrict") == 0 + || strcmp(builds[b+1], "exclude") == 0) + sepc = '|'; + if (nameopt.Count() == 1) + ; // default is single select + else if (mystrcasecmp(nameopt[1], "multiple") == 0) + mult = 1; + else if (mystrcasecmp(nameopt[1], "radio") == 0) + asinput = 1; + else if (mystrcasecmp(nameopt[1], "checkbox") == 0) + { + mult = 1; + asinput = 1; + } + if (nameopt.Count() > 2) + pre = nameopt[2]; + else + pre = ""; + if (nameopt.Count() > 3) + post = nameopt[3]; + else + post = ""; + + str = new String(); + if (!asinput) + { + *str << "<select "; + if (mult) + *str << "multiple "; + *str << "name=\"" << builds[b+1] << "\">\n"; + } + for (i = 0; i < namelist.Count(); i += ntuple) + { + if (*builds[b+6]) + currval = config->Find(builds[b+6]); + else if (input->exists(builds[b+1])) + currval = input->get(builds[b+1]); + else + currval = 0; + if (!asinput) + *str << pre << "<option value=\"" << 
namelist[i+ivalue-1] << '"'; + else if (mult) + *str << pre << "<input type=\"checkbox\" name=\"" << builds[b+1] + << "\" value=\"" << namelist[i+ivalue-1] << '"'; + else + *str << pre << "<input type=\"radio\" name=\"" << builds[b+1] + << "\" value=\"" << namelist[i+ivalue-1] << '"'; + if (!mult + && mystrcasecmp(namelist[i+ivalue-1], currval.get()) == 0 + || mult && + (cp = mystrcasestr(currval.get(), namelist[i+ivalue-1])) != NULL + && (cp == currval.get() || cp[-1] == '\001' || cp[-1] == sepc) + && (*(cp += strlen(namelist[i+ivalue-1])) == '\0' + || *cp == '\001' || *cp == sepc)) + { + if (!asinput) + *str << " selected"; + else + *str << " checked"; + ++nsel; + } + *str << '>' << namelist[i+ilabel-1] << post << '\n'; + } + if (!nsel && builds[b+7][0] && input->exists(builds[b+1])) + { + if (!asinput) + *str << pre << "<option value=\"" << input->get(builds[b+1]) + << "\" selected>" << builds[b+7] << post << '\n'; + else if (mult) + *str << pre << "<input type=\"checkbox\" name=\"" << builds[b+1] + << "\" value=\"" << input->get(builds[b+1]) + << "\" checked>" << builds[b+7] << post << '\n'; + else + *str << pre << "<input type=\"radio\" name=\"" << builds[b+1] + << "\" value=\"" << input->get(builds[b+1]) + << "\" checked>" << builds[b+7] << post << '\n'; + } + if (!asinput) + *str << "</select>\n"; + vars.Add(nameopt[0], str); + } + } + + // + // If a paged output is required, set the appropriate variables + // + if (nPages > 1) + { + if (pageNumber > 1) + { + str = new String("<a href=\""); + tmp = 0; + createURL(tmp, pageNumber - 1); + *str << tmp << "\">" << config->Find("prev_page_text") << "</a>"; + } + else + { + str = new String(config->Find("no_prev_page_text")); + } + vars.Add("PREVPAGE", str); + + if (pageNumber < nPages) + { + str = new String("<a href=\""); + tmp = 0; + createURL(tmp, pageNumber + 1); + *str << tmp << "\">" << config->Find("next_page_text") << "</a>"; + } + else + { + str = new String(config->Find("no_next_page_text")); + } + 
vars.Add("NEXTPAGE", str); + + str = new String(); + char *p; + QuotedStringList pnt(config->Find("page_number_text"), " \t\r\n"); + QuotedStringList npnt(config->Find("no_page_number_text"), " \t\r\n"); + QuotedStringList sep(config->Find("page_number_separator"), " \t\r\n"); + if (nPages > config->Value("maximum_page_buttons", 10)) + nPages = config->Value("maximum_page_buttons", 10); + for (i = 1; i <= nPages; i++) + { + if (i == pageNumber) + { + p = npnt[i - 1]; + if (!p) + p = form("%d", i); + *str << p; + } + else + { + p = pnt[i - 1]; + if (!p) + p = form("%d", i); + *str << "<a href=\""; + tmp = 0; + createURL(tmp, i); + *str << tmp << "\">" << p << "</a>"; + } + if (i != nPages && sep.Count() > 0) + *str << sep[(i-1)%sep.Count()]; + else if (i != nPages && sep.Count() <= 0) + *str << " "; + } + vars.Add("PAGELIST", str); + } + StringList form_vars(config->Find("allow_in_form"), " \t\r\n"); + String* key; + for (i= 0; i < form_vars.Count(); i++) + { + if(!config->Find(form_vars[i]).empty()) + { + key= new String(form_vars[i]); + key->uppercase(); + vars.Add(key->get(), new String(config->Find(form_vars[i]))); + } + } +} + +//***************************************************************************** +void +Display::createURL(String &url, int pageNumber) +{ + HtConfiguration* config= HtConfiguration::config(); + String s; + int i; +#define encodeInput(name) (s = input->get(name), encodeURL(s), s.get()) + + if (!config->Find("script_name").empty()) { + url << config->Find("script_name"); + } else { + url << getenv("SCRIPT_NAME"); + } + + url << '?'; + + if (input->exists("restrict")) + url << "restrict=" << encodeInput("restrict") << ';'; + if (input->exists("exclude")) + url << "exclude=" << encodeInput("exclude") << ';'; + // Not needed: The next loop below handles this output + //if (input->exists("config")) + // url << "config=" << encodeInput("config") << ';'; + + // Put out all specified collections. 
If none selected, resort to + // default behaviour. + char *config_name = collectionList[0]; + String config_encoded; + if (config_name && config_name[0] == '\0') + config_name = NULL; + + if (config_name) + { + for (i = 0; i < collectionList.Count(); i++) + { + config_name = collectionList[i]; + config_encoded = config_name; + encodeURL(config_encoded); + url << "config=" << config_encoded << ';'; + } + } + + if (input->exists("method")) + url << "method=" << encodeInput("method") << ';'; + if (input->exists("format")) + url << "format=" << encodeInput("format") << ';'; + if (input->exists("sort")) + url << "sort=" << encodeInput("sort") << ';'; + if (input->exists("matchesperpage")) + url << "matchesperpage=" << encodeInput("matchesperpage") << ';'; + if (input->exists("keywords")) + url << "keywords=" << encodeInput("keywords") << ';'; + if (input->exists("words")) + url << "words=" << encodeInput("words") << ';'; + if (input->exists("startyear")) + url << "startyear=" << encodeInput("startyear") << ';'; + if (input->exists("startmonth")) + url << "startmonth=" << encodeInput("startmonth") << ';'; + if (input->exists("startday")) + url << "startday=" << encodeInput("startday") << ';'; + if (input->exists("endyear")) + url << "endyear=" << encodeInput("endyear") << ';'; + if (input->exists("endmonth")) + url << "endmonth=" << encodeInput("endmonth") << ';'; + if (input->exists("endday")) + url << "endday=" << encodeInput("endday") << ';'; + StringList form_vars(config->Find("allow_in_form"), " \t\r\n"); + for (i= 0; i < form_vars.Count(); i++) + { + if (input->exists(form_vars[i])) + { + s = form_vars[i]; + encodeURL(s); // shouldn't be needed, but just in case + url << s << '='; + url << encodeInput(form_vars[i]) << ';'; + } + } + url << "page=" << pageNumber; +} + +//***************************************************************************** +void +Display::displayHTTPheaders() +{ + HtConfiguration* config= HtConfiguration::config(); + String content_type = 
config->Find("search_results_contenttype"); + if (config->Boolean("nph")) + cout << "HTTP/1.0 200 OK\r\n"; + if (content_type.length()) + cout << "Content-type: " << content_type << "\r\n\r\n"; +} + +//***************************************************************************** +void +Display::displayHeader() +{ + HtConfiguration* config= HtConfiguration::config(); + displayParsedFile(config->Find("search_results_header")); +} + +//***************************************************************************** +void +Display::displayFooter() +{ + HtConfiguration* config= HtConfiguration::config(); + displayParsedFile(config->Find("search_results_footer")); +} + +//***************************************************************************** +void +Display::displayNomatch() +{ + HtConfiguration* config= HtConfiguration::config(); + displayParsedFile(config->Find("nothing_found_file")); +} + +//***************************************************************************** +void +Display::displaySyntaxError(const String& message) +{ + HtConfiguration* config= HtConfiguration::config(); + displayHTTPheaders(); + setVariables(0, 0); + vars.Add("SYNTAXERROR", new String(message)); + displayParsedFile(config->Find("syntax_error_file")); +} + +//***************************************************************************** +void +Display::displayParsedFile(const String& filename) +{ + FILE *fl = fopen(filename, "r"); + char buffer[1000]; + + while (fl && fgets(buffer, sizeof(buffer), fl)) + { + expandVariables(buffer); + } + if (fl) + fclose(fl); + else if (debug) + cerr << "displayParsedFile: Can't open " << filename << endl; +} + +//***************************************************************************** +// If the result templates need to depend on the URL of the match, we need +// an efficient way to determine which template file to use. 
To do this, we +// will build a StringMatch object with all the URL patterns and also +// a List parallel to that pattern that contains the actual template file +// names to use for each URL. +// +void +Display::setupTemplates() +{ + HtConfiguration* config= HtConfiguration::config(); + String templatePatterns = config->Find("template_patterns"); + if (!templatePatterns.empty()) + { + // + // The templatePatterns string will have pairs of values. The first + // value of a pair will be a pattern, the second value will be a + // result template file name. + // + char *token = strtok(templatePatterns, " \t\r\n"); + String pattern; + while (token) + { + // + // First token is a pattern... + // + pattern << token << '|'; + + // + // Second token is an URL + // + token = strtok(0, " \t\r\n"); + URLtemplateList.Add(new String(token)); + if (token) + token = strtok(0, " \t\r\n"); + } + pattern.chop(1); + URLtemplate.Pattern(pattern); + } +} + +//***************************************************************************** +// If the star images need to depend on the URL of the match, we need +// an efficient way to determine which image to use. To do this, we +// will build a StringMatch object with all the URL patterns and also +// a List parallel to that pattern that contains the actual images to +// use for each URL. +// +void +Display::setupImages() +{ + HtConfiguration* config= HtConfiguration::config(); + String starPatterns = config->Find("star_patterns"); + if (!starPatterns.empty()) + { + // + // The starPatterns string will have pairs of values. The first + // value of a pair will be a pattern, the second value will be an + // URL to an image. + // + char *token = strtok(starPatterns, " \t\r\n"); + String pattern; + while (token) + { + // + // First token is a pattern... 
+ // + pattern << token << '|'; + + // + // Second token is an URL + // + token = strtok(0, " \t\r\n"); + URLimageList.Add(new String(token)); + if (token) + token = strtok(0, " \t\r\n"); + } + pattern.chop(1); + URLimage.Pattern(pattern); + } +} + +//***************************************************************************** +String * +Display::generateStars(DocumentRef *ref, int right) +{ + int i; + String *result = new String(); + HtConfiguration* config= HtConfiguration::config(); + if (!config->Boolean("use_star_image", 1)) + return result; + + String image = config->Find("star_image"); + const String blank = config->Find("star_blank"); + double score; + + if (maxScore != 0 && maxScore != minScore) + { + score = (ref->DocScore() - minScore) / (maxScore - minScore); + if(debug) cerr << "generateStars: doc, min, max " << ref->DocScore() << ", " << minScore << ", " << maxScore <<endl; + } + else + { + maxScore = ref->DocScore(); + score = 1; + } + int nStars = int(score * (maxStars - 1) + 0.5) + 1; + + vars.Add("NSTARS", new String(form("%.d", nStars))); + if(debug) cerr << "generateStars: nStars " << nStars << " of " << maxStars <<endl; + + if (right) + { + for (i = 0; i < maxStars - nStars; i++) + { + *result << "<img src=\"" << blank << "\" alt=\" \">"; + } + } + + int match = 0; + int length = 0; + int status; + + if (URLimage.hasPattern()) + status = URLimage.FindFirst(ref->DocURL(), match, length); + else + status = -1; + + if (status >= 0 && match >= 0) + { + image = ((String*) URLimageList[match])->get(); + } + + for (i = 0; i < nStars; i++) + { + *result << "<img src=\"" << image << "\" alt=\"*\">"; + } + + if (!right) + { + for (i = 0; i < maxStars - nStars; i++) + { + *result << "<img src=\"" << blank << "\" alt=\" \">"; + } + } + + return result; +} + +//***************************************************************************** +String * +Display::readFile(const String& filename) +{ + FILE *fl; + String *s = new String(); + char line[1024]; + + 
fl = fopen(filename, "r"); + while (fl && fgets(line, sizeof(line), fl)) + { + *s << line; + } + if (fl) + fclose(fl); + else if (debug) + cerr << "readFile: Can't open " << filename << endl; + return s; +} + +//***************************************************************************** +void +Display::expandVariables(const String& str_arg) +{ + const char* str = str_arg; + enum + { + StStart, StLiteral, StVarStart, StVarClose, StVarPlain, StGotVar + } state = StStart; + String var = ""; + + while (str && *str) + { + switch (state) + { + case StStart: + if (*str == '\\') + state = StLiteral; + else if (*str == '$') + state = StVarStart; + else + cout << *str; + break; + case StLiteral: + cout << *str; + state = StStart; + break; + case StVarStart: + if (*str == '%' || *str == '=') + var << *str; // code for URL-encoded/decoded variable + else if (*str == '&') + { + var << *str; // code for SGML-encoded variable + if (mystrncasecmp("&", str, 5) == 0) + str += 4; + } + else if (*str == '(' || *str == '{') + state = StVarClose; + else if (isalnum(*str) || *str == '_' || *str == '-') + { + var << *str; + state = StVarPlain; + } + else + state = StStart; + break; + case StVarClose: + if (*str == ')' || *str == '}') + state = StGotVar; + else if (isalnum(*str) || *str == '_' || *str == '-') + var << *str; + else + state = StStart; + break; + case StVarPlain: + if (isalnum(*str) || *str == '_' || *str == '-') + var << *str; + else + { + state = StGotVar; + continue; + } + break; + case StGotVar: + // + // We have a complete variable in var. Look it up and + // see if we can find a good replacement for it. + // + outputVariable(var); + var = ""; + state = StStart; + continue; + } + str++; + } + if (state == StGotVar || state == StVarPlain) + { + // + // The end of string was reached, but we are still trying to + // put a variable together. Since we now have a complete + // variable, we will look up the value for it. 
+ // + outputVariable(var); + } +} + +//***************************************************************************** +void +Display::outputVariable(const String& var) +{ + String *temp; + String value = ""; + const char *ev, *name; + + // We have a complete variable name in var. Look it up and + // see if we can find a good replacement for it, either in our + // vars dictionary or in the environment variables. + name = var; + while (*name == '&' || *name == '%' || *name == '=') + name++; + temp = (String *) vars[name]; + if (temp) + value = *temp; + else + { + ev = getenv(name); + if (ev) + value = ev; + } + while (--name >= var.get() && value.length()) + { + if (*name == '%') + encodeURL(value); + else if(*name == '&') + value = HtSGMLCodec::instance()->decode(value); + else // (*name == '=') + decodeURL(value); + } + cout << value; +} + +//***************************************************************************** +List * +Display::buildMatchList() +{ + HtConfiguration* config= HtConfiguration::config(); + char *cpid; + String url; + ResultMatch *thisMatch; + SplitMatches matches(*config); + double backlink_factor = config->Double("backlink_factor"); + double date_factor = config->Double("date_factor"); + double backlink_score = 0; + double date_score = 0; + double base_score = 0; + + + // Additions made here by Mike Grommet ... 
+ + tm startdate; // structure to hold the startdate specified by the user + tm enddate; // structure to hold the enddate specified by the user + time_t now = time((time_t *)0); // fill in all fields for mktime + tm *lt = localtime(&now); // - Gilles's fix + startdate = *lt; + enddate = *lt; + + time_t eternity = ~(1<<(sizeof(time_t)*8-1)); // will be the largest value holdable by a time_t + tm endoftime; // the time_t eternity will be converted into a tm, held by this variable + + time_t timet_startdate; + time_t timet_enddate; + int monthdays[] = {31,28,31,30,31,30,31,31,30,31,30,31}; + + // boolean to test to see if we need to build date information or not + int dategiven = ((config->Value("startmonth")) || + (config->Value("startday")) || + (config->Value("startyear")) || + (config->Value("endmonth")) || + (config->Value("endday")) || + (config->Value("endyear"))); + + // find the end of time + lt = gmtime(&eternity); + endoftime = *lt; + + if(dategiven) // user specified some sort of date information + { + int reldate = ((config->Value("startmonth") < 0) || + (config->Value("startday") < 0) || + (config->Value("startyear") < 0)); + int t; + + // set up the startdate structure + // see man mktime for details on the tm structure + startdate.tm_sec = 0; + startdate.tm_min = 0; + startdate.tm_hour = 0; + startdate.tm_yday = 0; + startdate.tm_wday = 0; + + // The concept here is that if a user did not specify a part of a date, + // then we will make assumtions... + // For instance, suppose the user specified Feb, 1999 as the start + // range, we take steps to make sure that the search range date starts + // at Feb 1, 1999, + // along these same lines: (these are in MM-DD-YYYY format) + // Startdates: Date Becomes + // 01-01 01-01-1970 + // 01-1970 01-01-1970 + // 04-1970 04-01-1970 + // 1970 01-01-1970 + // These things seem to work fine for start dates, as all months have + // the same first day however the ending date can't work this way. 
+ + if(config->Value("startday")) // form input specified a start day + { + t = config->Value("startday"); + if (t < 0) + { + time_t then = now + (t * (24*60*60)); + lt = localtime(&then); + startdate.tm_mday = lt->tm_mday; + startdate.tm_mon = lt->tm_mon; + startdate.tm_year = lt->tm_year; + } + else + startdate.tm_mday = t; + // tm days are 1 based, they are passed in as 1 based + } + else if (!reldate) + startdate.tm_mday = 1; // otherwise, no start day, default to 1 + + if(config->Value("startmonth")) // form input specified a start month + { + t = config->Value("startmonth"); + if (t < 0) + startdate.tm_mon += t; + else + startdate.tm_mon = t - 1; + // tm months are zero based. They are passed in as 1 based + while (startdate.tm_mon < 0) + { + startdate.tm_mon += 12; + startdate.tm_year--; + } + } + else if (!reldate) + startdate.tm_mon = 0; // otherwise, no start month, default to 0 + + // year is handled a little differently... the tm_year structure + // wants the tm_year in a format of year - 1900. + // since we are going to convert these dates to a time_t, + // a time_t value of zero, the earliest possible date + // occurs Jan 1, 1970. If we allow dates < 1970, then we + // could get negative time_t values right??? 
+ // (barring minor timezone offsets west of GMT, where Epoch is 12-31-69) + + if(config->Value("startyear")) // form input specified a start year + { + t = config->Value("startyear"); + if (t < 0) + startdate.tm_year += t; + else + { + startdate.tm_year = config->Value("startyear") - 1900; + if (startdate.tm_year < 69-1900) // correct for 2-digit years 00-68 + startdate.tm_year += 2000; // - Gilles's fix + if (startdate.tm_year < 0) // correct for 2-digit years 69-99 + startdate.tm_year += 1900; + } + } + else if (!reldate) + startdate.tm_year = 1970-1900; + // otherwise, no start day, specify start at 1970 + + reldate = ((config->Value("endmonth") < 0) || + (config->Value("endday") < 0) || + (config->Value("endyear") < 0)); + + // set up the enddate structure + enddate.tm_sec = 59; // allow up to last second of end day + enddate.tm_min = 59; // - Gilles's fix + enddate.tm_hour = 23; + enddate.tm_yday = 0; + enddate.tm_wday = 0; + + if(config->Value("endday") < 0) // form input specified relative end day + { + // relative end day must be done before month or year + t = config->Value("endday"); + time_t then = now + (t * (24*60*60)); + lt = localtime(&then); + enddate.tm_mday = lt->tm_mday; + enddate.tm_mon = lt->tm_mon; + enddate.tm_year = lt->tm_year; + } + + if(config->Value("endmonth")) // form input specified an end month + { + t = config->Value("endmonth"); + if (t < 0) + enddate.tm_mon += t; + else + enddate.tm_mon = t - 1; + // tm months are zero based. 
They are passed in as 1 based + while (enddate.tm_mon < 0) + { + enddate.tm_mon += 12; + enddate.tm_year--; + } + } + else if (!reldate) + enddate.tm_mon = 11; // otherwise, no end month, default to 11 + + if(config->Value("endyear")) // form input specified a end year + { + t = config->Value("endyear"); + if (t < 0) + enddate.tm_year += t; + else + { + enddate.tm_year = config->Value("endyear") - 1900; + if (enddate.tm_year < 69-1900) // correct for 2-digit years 00-68 + enddate.tm_year += 2000; // - Gilles's fix + if (enddate.tm_year < 0) // correct for 2-digit years 69-99 + enddate.tm_year += 1900; + } + } + else if (!reldate) + enddate.tm_year = endoftime.tm_year; + // otherwise, no end year, specify end at the end of time allowable + + // Months have different number of days, and this makes things more + // complicated than the startdate range. + // Following the example above, here is what we want to happen: + // Enddates: Date Becomes + // 04-31 04-31-endoftime.tm_year + // 05-1999 05-31-1999, may has 31 days... we want to search until the end of may so... + // 1999 12-31-1999, search until the end of the year + + if(config->Value("endday") > 0) // form input specified an end day + { + enddate.tm_mday = config->Value("endday"); + // tm days are 1 based, they are passed in as 1 based + } + else if (!reldate) + { + // otherwise, no end day, default to the end of the month + enddate.tm_mday = monthdays[enddate.tm_mon]; + if (enddate.tm_mon == 1) // February, so check for leap year + if (((enddate.tm_year+1900) % 4 == 0 && + (enddate.tm_year+1900) % 100 != 0) || + (enddate.tm_year+1900) % 400 == 0) + enddate.tm_mday += 1; // Feb. 29 - Gilles's fix + } + + // Convert the tm values into time_t values. + // Web servers specify modification times in GMT, but htsearch + // displays these modification times in the server's local time zone. + // For consistency, we would prefer to select based on this same + // local time zone. 
- Gilles's fix + + timet_startdate = mktime(&startdate); + timet_enddate = mktime(&enddate); + + // I'm not quite sure what behavior I want to happen if + // someone reverses the start and end dates, and one of them is invalid. + // for now, if there is a completely invalid date on the start or end + // date, I will force the start date to time_t 0, and the end date to + // the maximum that can be handled by a time_t. + + if(timet_startdate < 0) + timet_startdate = 0; + if(timet_enddate < 0) + timet_enddate = eternity; + + // what if the user did something really goofy like choose an end date + // that's before the start date + + if(timet_enddate < timet_startdate) // if so, then swap them so they are in order + { + time_t timet_temp = timet_enddate; + timet_enddate = timet_startdate; + timet_startdate = timet_temp; + } + } + else // no date was specifed, so plug in some defaults + { + timet_startdate = 0; + timet_enddate = eternity; + } + + // ... MG + + + URLSeedScore adjustments(*config); + + // If we knew where to pass it, this would be a good place to pass + // on errors from adjustments.ErrMsg(). + +// Deal with all collections +// + selected_collections->Start_Get(); + Collection *collection= NULL; + while ((collection = (Collection *) selected_collections->Get_NextElement())) + { + ResultList *results = collection->getResultList(); + if (results == NULL) + continue; + + results->Start_Get(); + while ((cpid = results->Get_Next())) + { + int id = atoi(cpid); + + // DocumentRef *thisRef = docDB[id]; + + DocMatch *dm = results->find(cpid); + Collection *collection = NULL; + if (dm) + collection = dm->collection; + if (collection == NULL) continue; + DocumentRef *thisRef = collection->getDocumentRef(id); + + // + // If it wasn't there, then ignore it + // + if (thisRef == 0) + { + continue; + } + + url = thisRef->DocURL(); + HtURLRewriter::instance()->replace(url); + if (!includeURL(url.get())) + { + // Get rid of it to free the memory! 
+ delete thisRef; + + continue; + } + + // Code added by Mike Grommet for date search ranges + // check for valid date range. toss it out if it isn't relevant. + if ((timet_startdate > 0 || timet_enddate < eternity) && + (thisRef->DocTime() < timet_startdate || thisRef->DocTime() > timet_enddate)) + { + delete thisRef; + continue; + } + + thisMatch = ResultMatch::create(); + thisMatch->setID(id); + thisMatch->setCollection(collection); + + // + // Assign the incomplete score to this match. This score was + // computed from the word database only, no excerpt context was + // known at that time, or info about the document itself, + // so this still needs to be done. + // + + // Moved up: DocMatch *dm = results->find(cpid); + double score = dm->score; + + // We need to scale based on date relevance and backlinks + // Other changes to the score can happen now + // Or be calculated by the result match in getScore() + + // This formula derived through experimentation + // We want older docs to have smaller values and the + // ultimate values to be a reasonable size (max about 100) + + base_score = score; + if (date_factor != 0.0) + { + +// Macro for calculating the date factor (31536000 is the number of +// seconds in a 365 days year). The formula gives less weight +// as the distance between the date document and the current time +// increases (the absolute value is for documents with future date) +#define DATE_FACTOR(df, n, dd) ((df) * 100 / (1+(double)(abs((n) - (dd)) / 31536000))) + date_score = DATE_FACTOR(date_factor, now, thisRef->DocTime()); + score += date_score; + } + + if (backlink_factor != 0.0) + { + int links = thisRef->DocLinks(); + if (links == 0) + links = 1; // It's a hack, but it helps... 
+ + backlink_score = backlink_factor + * (thisRef->DocBackLinks() / (double)links); + score += backlink_score; + } + + if (debug) { + cerr << thisRef->DocURL() << "\n"; + } + + thisMatch->setTime(thisRef->DocTime()); + thisMatch->setTitle(thisRef->DocTitle()); + + score = adjustments.adjust_score(score, thisRef->DocURL()); + + score = log(1.0 + score); + thisMatch->setScore(score); + thisMatch->setAnchor(dm->anchor); + + // + // Append this match to our list of matches. + // + if (score > 0.0) + matches.Add(thisMatch, thisRef->DocURL()); + + // Get rid of it to free the memory! + delete thisRef; + + if (debug) + { + cerr << " base_score " << base_score << " date_score " << date_score << " backlink_score " << backlink_score << "\n"; + cerr << " score " << score << "(" << thisMatch->getScore() << "), maxScore " << maxScore <<", minScore " << minScore << endl; + } + + if (maxScore < score) + {if(debug) cerr << "Set maxScore = score" <<endl; + maxScore = score; + } + if (minScore > score && score > 0.0) + {if(debug) cerr << "Set minScore = score" <<endl; + minScore = score; + } + } + } + + // + // Each sub-area is then sorted by relevance level. + // + List *matches_part; // Outside of loop to keep for-scope warnings away. + for (matches_part = matches.Get_First(); + matches_part != 0; + matches_part = matches.Get_Next()) + sort(matches_part); + + // Then all sub-lists are concatenated and put in a new list. + return matches.JoinedLists(); +} + +//***************************************************************************** +String * +Display::excerpt(ResultMatch *match, DocumentRef *ref, String urlanchor, int fanchor, int &first) +{ + HtConfiguration* config= HtConfiguration::config(); + // It is necessary to keep alive the String you .get() a char * from, + // as long as you use the char *. 
+ + //String head_string; + + char *head; + int use_meta_description=0; + Collection *collection = match->getCollection(); + + if (config->Boolean("use_meta_description",0) + && strlen(ref->DocMetaDsc()) != 0) + { + // Set the head to point to description + head = ref->DocMetaDsc(); + use_meta_description = 1; + } + else + { + // docDB.ReadExcerpt(*ref); + collection->ReadExcerpt(*ref); + head = ref->DocHead(); // head points to the top + } + + //head_string = HtSGMLCodec::instance()->decode(head); + //head = head_string.get(); + + int which, length; + char *temp = head; + String part; + String *text = new String(""); + + StringMatch *allWordsPattern = NULL; + if (collection) + allWordsPattern = collection->getSearchWordsPattern(); + if (!allWordsPattern) + return text; + + // htsearch displays the description when: + // 1) a description has been found + // 2) the option "use_meta_description" is set to true + // If previous conditions are false and "excerpt_show_top" is set to true + // it shows the whole head. Else, it acts as default. + + if (config->Boolean("excerpt_show_top", 0) || use_meta_description || + !allWordsPattern->hasPattern()) + first = 0; + else + first = allWordsPattern->FindFirstWord(head, which, length); + + if (first < 0 && config->Boolean("no_excerpt_show_top")) + first = 0; // No excerpt, but we want to show the top. + + if (first < 0) + { + // + // No excerpt available, don't show top, so display message + // + if (!config->Find("no_excerpt_text").empty()) + { + *text << config->Find("no_excerpt_text"); + } + } + else + if ( first == 0 || config->Value( "max_excerpts" ) == 1 ) + { + int headLength = strlen(head); + int length = config->Value("excerpt_length", 50); + char *start; + char *end; + WordType type(*config); + + if (!config->Boolean("add_anchors_to_excerpt")) + // negate flag if it's on (anchor available) + fanchor = 0; + + // + // Figure out where to start the excerpt. 
Basically we go back + // half the excerpt length from the first matched word + // + start = &temp[first] - length / 2; + if (start < temp) + start = temp; + else + { + *text << config->Find("start_ellipses"); + while (*start && type.IsStrictChar(*start)) + start++; + } + + // + // Figure out the end of the excerpt. + // + end = start + length; + if (end > temp + headLength) + { + end = temp + headLength; + *text << hilight(match, start, urlanchor, fanchor); + } + else + { + while (*end && type.IsStrictChar(*end)) + end++; + *end = '\0'; + *text << hilight(match, start, urlanchor, fanchor); + *text << config->Find("end_ellipses"); + } + } + else + { + *text = buildExcerpts( allWordsPattern, match, head, urlanchor, fanchor ); + } + + return text; +} +// +//***************************************************************************** +// Handle cases where multiple document excerpts are requested. +// +const String +Display::buildExcerpts( StringMatch *allWordsPattern, ResultMatch* match, char *head, String urlanchor, int fanchor ) +{ + HtConfiguration* config= HtConfiguration::config(); + if ( !config->Boolean( "add_anchors_to_excerpt" ) ) + { + fanchor = 0; + } + + int headLength = strlen( head ); + int excerptNum = config->Value( "max_excerpts", 1 ); + int excerptLength = config->Value( "excerpt_length", 50 ); + int lastPos = 0; + int curPos = 0; + + String text; + + for ( int i = 0; i < excerptNum; ++i ) + { + int which, termLength; + + int nextPos = allWordsPattern->FindFirstWord( head + lastPos, + which, termLength ); + + if ( nextPos < 0 ) + { + // Ran out of matching terms + break; + } + else + { + // Determine offset from beginning of head + curPos = lastPos + nextPos; + } + + // Slip a break in since there is another excerpt coming + if ( i != 0 ) + { + text << "<br>"; + } + + // Determine where excerpt starts + char *start = &head[curPos] - excerptLength / 2; + + if ( start < head ) + { + start = head; + } + else + { + text << 
config->Find("start_ellipses"); + + while ( *start && HtIsStrictWordChar( *start ) ) + { + start++; + } + } + + // Determine where excerpt ends + char *end = start + excerptLength; + + if ( end > head + headLength ) + { + end = head + headLength; + + text << hilight( match, start, urlanchor, fanchor ); + } + else + { + while ( *end && HtIsStrictWordChar( *end ) ) + { + end++; + } + + // Save end char so that it can be restored + char endChar = *end; + + *end = '\0'; + + text << hilight(match, start, urlanchor, fanchor); + text << config->Find("end_ellipses"); + + *end = endChar; + } + + // No more words left to examine in head + if ( (lastPos = curPos + termLength) > headLength ) + break; + } + + return text; +} + +//***************************************************************************** +String +Display::hilight(ResultMatch *match, const String& str_arg, const String& urlanchor, int fanchor) +{ + HtConfiguration* config= HtConfiguration::config(); + const String start_highlight = config->Find("start_highlight"); + const String end_highlight = config->Find("end_highlight"); + const String anchor_target = config->Find("anchor_target"); + const char *str = str_arg; + String result; + int pos = 0; + int which, length; + WeightWord *ww; + int first = 1; + String s; +#define SGMLencodedChars(p, l) (s = 0, s.append(p, l), HtSGMLCodec::instance()->decode(s)) + + result = 0; + Collection *collection = match->getCollection(); + StringMatch *allWordsPattern = NULL; + if (collection) + allWordsPattern = collection->getSearchWordsPattern(); + List *searchWords = NULL; + if (collection) + searchWords = collection->getSearchWords(); + if (!allWordsPattern || !searchWords) + return result; + + while (allWordsPattern->hasPattern() && + (pos = allWordsPattern->FindFirstWord(str, which, length)) >= 0) + { + //result.append(str, pos); + result << SGMLencodedChars(str, pos); + ww = (WeightWord *) (*searchWords)[which]; + result << start_highlight; + if (first && fanchor) + { + 
result << "<a "; + if (anchor_target.length() > 0) + result << "target=\"" << anchor_target << "\" "; + result << "href=\"" << urlanchor << "\">"; + } + //result.append(str + pos, length); + result << SGMLencodedChars(str + pos, length); + if (first && fanchor) + result << "</a>"; + result << end_highlight; + str += pos + length; + first = 0; + } + //result.append(str); + result << SGMLencodedChars(str, strlen(str)); + return result; +} + +//***************************************************************************** +void +Display::sort(List *matches) +{ + HtConfiguration* config= HtConfiguration::config(); + int numberOfMatches = matches->Count(); + int i; + + if (numberOfMatches <= 1) + return; + + ResultMatch **array = new ResultMatch*[numberOfMatches]; + for (i = 0; i < numberOfMatches; i++) + { + array[i] = (ResultMatch *)(*matches)[i]; + } + matches->Release(); + + qsort((char *) array, numberOfMatches, sizeof(ResultMatch *), + array[0]->getSortFun()); + + const String st = config->Find("sort"); + if (!st.empty() && mystrncasecmp("rev", st, 3) == 0) + { + for (i = numberOfMatches; --i >= 0; ) + matches->Add(array[i]); + } + else + { + for (i = 0; i < numberOfMatches; i++) + matches->Add(array[i]); + } + delete [] array; +} + +//***************************************************************************** +void +Display::logSearch(int page, List *matches) +{ +//Note: This is Posix and dependent on a running syslogd.. 
+//does not work for Win32 +//TODO: Look into using native windows system logs instead +#ifndef _MSC_VER /* _WIN32 */ + HtConfiguration* config= HtConfiguration::config(); + // Currently unused time_t t; + int nMatches = 0; + int level = LOG_LEVEL; + int facility = LOG_FACILITY; + char *host = getenv("REMOTE_HOST"); + char *ref = getenv("HTTP_REFERER"); + + if (host == NULL) + host = getenv("REMOTE_ADDR"); + if (host == NULL) + host = "-"; + + if (ref == NULL) + ref = "-"; + + if (matches) + nMatches = matches->Count(); + + openlog("htsearch", LOG_PID, facility); + syslog(level, "%s [%s] (%s) [%s] [%s] (%d/%s) - %d -- %s\n", + host, + input->exists("config") ? input->get("config") : "default", + (const char*)config->Find("match_method"), + input->exists("words") ? input->get("words") : "", + logicalWords.get(), + nMatches, (const char*)config->Find("matches_per_page"), + page, ref + ); +#endif +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Display.h b/debian/htdig/htdig-3.2.0b6/htsearch/Display.h new file mode 100644 index 00000000..2d144ab1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Display.h @@ -0,0 +1,238 @@ +// +// Display.h +// +// Display: Takes results of search and fills in the HTML templates +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Display.h,v 1.27 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _Display_h_ +#define _Display_h_ + +#include "Object.h" +#include "ResultList.h" +#include "ResultMatch.h" +#include "TemplateList.h" +#include "cgi.h" +#include "StringMatch.h" +#include "List.h" +#include "DocumentDB.h" +#include "Database.h" +#include "Dictionary.h" +#include "HtRegex.h" + +class Display : public Object +{ +public: + // + // Construction/Destruction + // + // 
Display(const String& docFile, const String& indexFile, const String& excerptFile); + + Display(Dictionary *selected_collections); + ~Display(); + + void setStartTemplate(const String& templateName); + void setMatchTemplate(const String& templateName); + void setEndTemplate(const String& templateName); + + // inline void setResults(ResultList *results); + // inline void setSearchWords(List *searchWords); + inline void setLimit(HtRegex *); + inline void setExclude(HtRegex *); + // inline void setAllWordsPattern(StringMatch *); + inline void setLogicalWords(char *); + inline void setOriginalWords(char *); + inline void setCGI(cgi *); + + void display(int pageNumber); + void displayMatch(ResultMatch *match, DocumentRef *ref, int current); + void displayHTTPheaders(); + void displayHeader(); + void displayFooter(); + void displayNomatch(); + void displaySyntaxError(const String &); + + int hasTemplateError() {return templateError;} + +protected: + // + // Multiple database support + // + Dictionary *selected_collections; + + // + // Search Policy + char *search_policy; + + // + // The list of search results. + // + // ResultList *results; + + // + // The database that contains documents. + // + // DocumentDB docDB; + + // + // A list of words that we are searching for + // + // List *searchWords; + + // + // Pattern that all result URLs must match or exclude + // + HtRegex *limitTo; + HtRegex *excludeFrom; + + // + // Pattern of all the words + // + // StringMatch *allWordsPattern; + + // + // Variables for substitution into text are stored in a dictionary + // + Dictionary vars; + + // + // Since the creation of excerpts is somewhat time consuming, we will + // only compute them if they're actually going to be used. This is the + // flag that tells us if we will need the excerpt. + // + int needExcerpt; + + // + // Since we might have errors we cannot recover from, this tells us + // what happened. 
+ // + int templateError; + + // + // To allow the result templates to be dependant on the match URL, we need + // the following: + // + StringMatch URLtemplate; + List URLtemplateList; + + // + // To allow the star images to be dependant on the match URL, we need + // the following: + // + StringMatch URLimage; + List URLimageList; + + // + // Maximum number of stars to display + // + int maxStars; + double maxScore; + double minScore; + + // + // For display, we have different versions of the list of words. + // + String logicalWords; + String originalWords; + + // + // To be able to recreate the URL that will get to us again, we need + // the info from the HTML form that called us. + // + cgi *input; + + // + // Match output is done through templates. This is the interface to these + // templates. + // + TemplateList templates; + Template *currentTemplate; + + // + // Methods... + // + List *buildMatchList(); + void sort(List *); + + int includeURL(const String&); + String *readFile(const String&); + void expandVariables(const String&); + void outputVariable(const String&); + String *excerpt(ResultMatch *match, DocumentRef *ref, + String urlanchor, int fanchor, int &first); + const String buildExcerpts(StringMatch *allWordsPattern, + ResultMatch *match, char *head, + String urlanchor, int fanchor ); + String hilight(ResultMatch *match, const String& str, + const String& urlanchor, int fanchor); + void setupTemplates(); + void setupImages(); + String *generateStars(DocumentRef *, int); + void displayParsedFile(const String&); + void setVariables(int, List *); + void createURL(String &, int); + void logSearch(int, List *); +}; + +//***************************************************************************** +inline void +Display::setLimit(HtRegex *limit) +{ + limitTo = limit; +} + +inline void +Display::setExclude(HtRegex *exclude) +{ + excludeFrom = exclude; +} + +#if 0 +inline void +Display::setAllWordsPattern(StringMatch *pattern) +{ + allWordsPattern = 
pattern; +} + +inline void +Display::setResults(ResultList *results) +{ + this->results = results; +} + +inline void +Display::setSearchWords(List *searchWords) +{ + this->searchWords = searchWords; +} +#endif + +inline void +Display::setLogicalWords(char *s) +{ + logicalWords = s; + vars.Add("LOGICAL_WORDS", new String(logicalWords)); +} + +inline void +Display::setOriginalWords(char *s) +{ + originalWords = s; + vars.Add("WORDS", new String(originalWords)); +} + +inline void +Display::setCGI(cgi *aCgi) +{ + input = aCgi; +} + +#endif + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/DocMatch.cc b/debian/htdig/htdig-3.2.0b6/htsearch/DocMatch.cc new file mode 100644 index 00000000..575c82aa --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/DocMatch.cc @@ -0,0 +1,222 @@ +// DocMatch.cc +// +// DocMatch: Data object only. Contains information related to a given +// document that was matched by a search. For instance, the +// score of the document for this search. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocMatch.cc,v 1.8 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "DocMatch.h" +#include "HtConfiguration.h" +#include "HtWordReference.h" + +#ifdef HAVE_STD +#include <iostream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#endif /* HAVE_STD */ + +//******************************************************************************* +// DocMatch::DocMatch() +// + + +//******************************************************************************* +// DocMatch::~DocMatch() +// +DocMatch::~DocMatch() +{ +} + +// +// merge with another match +// sets anchor to the lower value +// merges location 
lists +// +void +DocMatch::Merge(const DocMatch &match) +{ + if(match.anchor < anchor) + { + anchor = match.anchor; + } + AddLocations(match.GetLocations()); +} + +// +// adds locations to an existing list +// avoiding duplicates, in location order +// +void +DocMatch::AddLocations(const List *locs) +{ + List *merge = new List; + ListCursor c; + + locations->Start_Get(); + locs->Start_Get(c); + Location *a = (Location *)locations->Get_Next(); + Location *b = (Location *)locs->Get_Next(c); + while(a && b) + { + if(a->from < b->from) + { + merge->Add(a); + a = (Location *)locations->Get_Next(); + } + else if(a->from > b->from) + { + merge->Add(new Location(*b)); + b = (Location *)locs->Get_Next(c); + } + else // (a->from == b->from) + { + if(a->to < b->to) + { + merge->Add(new Location(*a)); + merge->Add(new Location(*b)); + } + else if(a->to > b->to) + { + merge->Add(new Location(*b)); + merge->Add(new Location(*a)); + } + else // (a->to == b->to) + { + merge->Add(new Location( + a->from, + a->to, + a->flags, + a->weight + b->weight)); + } + a = (Location *)locations->Get_Next(); + b = (Location *)locs->Get_Next(c); + } + } + while(a) + { + merge->Add(a); + a = (Location *)locations->Get_Next(); + } + while(b) + { + merge->Add(new Location(*b)); + b = (Location *)locs->Get_Next(c); + } + locations->Release(); + delete locations; + locations = merge; +} + +// +// set the location list +// +void +DocMatch::SetLocations(List *locs) +{ + delete locations; + locations = locs; +} + +// +// copy constructor, copies locations +// +DocMatch::DocMatch(const DocMatch &other) +{ + score = -1.0; + //score = other.score; + id = other.id; + anchor = other.anchor; + locations = new List; + AddLocations(other.GetLocations()); +} + +// +// set weight of all locations +// +void +DocMatch::SetWeight(double weight) +{ + locations->Start_Get(); + for(int i = 0; i < locations->Count(); i++) + { + Location *loc = (Location *)locations->Get_Next(); + loc->weight = weight; + } +} + +// +// 
debug dump +// +void +DocMatch::Dump() +{ + cerr << "DocMatch id: " << id << " {" << endl; + locations->Start_Get(); + for(int i = 0; i < locations->Count(); i++) + { + Location *loc = (Location *)locations->Get_Next(); + cerr << "location [" << loc->from; + cerr << ", " << loc->to << "] "; + cerr << "weight " << loc->weight; + cerr << " flags " << loc->flags; + cerr << endl; + } + cerr << "score: " << GetScore() << endl << "}" << endl; +} + +double +DocMatch::GetScore() +{ + HtConfiguration* config= HtConfiguration::config(); + static double text_factor = config->Double("text_factor", 1); + static double caps_factor = config->Double("caps_factor", 1); + static double title_factor = config->Double("title_factor", 1); + static double heading_factor = config->Double("heading_factor", 1); + static double keywords_factor = config->Double("keywords_factor", 1); + static double meta_desc_factor = config->Double("meta_description_factor", 1); + static double author_factor = config->Double("author_factor", 1); + static double description_factor = config->Double("description_factor", 1); + static double url_text_factor = config->Double("url_text_factor", 1); + + if (score == -1.0) + { + score = 0.0; + + double locresult = 0.0; + ListCursor c; + locations->Start_Get(c); + Location *loc = (Location *)locations->Get_Next(c); + while(loc) + { + locresult = 0.0; + if (loc->flags == FLAG_TEXT) locresult += text_factor; + if (loc->flags & FLAG_CAPITAL) locresult += caps_factor; + if (loc->flags & FLAG_TITLE) locresult += title_factor; + if (loc->flags & FLAG_HEADING) locresult += heading_factor; + if (loc->flags & FLAG_KEYWORDS) locresult += keywords_factor; + if (loc->flags & FLAG_DESCRIPTION) locresult += meta_desc_factor; + if (loc->flags & FLAG_AUTHOR) locresult += author_factor; + if (loc->flags & FLAG_LINK_TEXT) locresult += description_factor; + if (loc->flags & FLAG_URL) locresult += url_text_factor; + + score += loc->weight * locresult; + loc = (Location 
*)locations->Get_Next(c); + } + } + return score; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/DocMatch.h b/debian/htdig/htdig-3.2.0b6/htsearch/DocMatch.h new file mode 100644 index 00000000..798aadb3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/DocMatch.h @@ -0,0 +1,109 @@ +// +// DocMatch.h +// +// DocMatch: Data object only. Contains information related to a given +// document that was matched by a search. For instance, the +// score of the document for this search. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: DocMatch.h,v 1.9 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _DocMatch_h_ +#define _DocMatch_h_ + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "Object.h" +#include "List.h" + +class Collection; + +// +// an element of the DocMatch location list +// +struct Location : public Object +{ + Location(int f, int t, unsigned int l, double w = 1.0) : + from(f), to(t), flags(l), weight(w) {} + Location(const Location &l) : + from(l.from), to(l.to), flags(l.flags), weight(l.weight) {} + int from; + int to; + unsigned int flags; + double weight; +}; + +// +// an element of a ResultList +// +class DocMatch : public Object +{ +public: + // default constructor + DocMatch() : + locations(new List), + score(-1.0), + id(0), + anchor(0), + collection(0) {} + + // copy constructor + DocMatch(const DocMatch &); + + // destructor + ~DocMatch(); + + // match join + void Merge(const DocMatch &); + + // score accessor + double GetScore(); + void SetScore(double); + + // doc id accessors + int GetId() const { return id; } + void SetId(int x) { id = x; } + + // anchor accessors + int GetAnchor() const { return anchor; } + void SetAnchor(int x) { anchor = x; } + + // location list 
accessors + const List *GetLocations() const { return locations; } + void SetLocations(List *); + void AddLocations(const List *); + + // add one location to the list + // use with caution -- does not ensure {ordered} + void AddLocation(Location *x) { locations->Add(x); } + + // set weight of all locations + void SetWeight(double weight); + + // debug + void Dump(); + +private: + List *locations; +// the rest should be private: +// but is already used by the old htsearch +public: + + double score; + int id; + int anchor; + short int orMatches; + Collection *collection; // Multiple databases +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/ExactWordQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/ExactWordQuery.cc new file mode 100644 index 00000000..1bbb4b3a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/ExactWordQuery.cc @@ -0,0 +1,53 @@ +// +// ExactWordQuery.cc +// +// ExactWordQuery: A Query tree leaf object. Wraps a database access +// that generates ResultLists for word matches. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExactWordQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "ExactWordQuery.h" +#include "WordSearcher.h" + +// +// the searcher object used by all instances +// of ExactWord +// +WordSearcher * +ExactWordQuery::searcher = 0; + +// +// set the weight of the matches to my weight +// +void +ExactWordQuery::AdjustWeight(ResultList &results) +{ + results.SetWeight(weight); +} + +// +// tell the searcher to fetch my word in the database +// return 0 if no matches +// +ResultList * +ExactWordQuery::Evaluate() +{ + ResultList *result = 0; + if(searcher) + { + result = searcher->Search(word); + } + if(result && !result->Count() && !result->IsIgnore()) + { + delete result; + result = 0; + } + return result; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/ExactWordQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/ExactWordQuery.h new file mode 100644 index 00000000..fafcb878 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/ExactWordQuery.h @@ -0,0 +1,71 @@ +#ifndef _ExactWordQuery_h_ +#define _ExactWordQuery_h_ + +// +// ExactWordQuery.h +// +// ExactWordQuery: A Query tree leaf object. Wraps a database access +// that generates ResultLists for word matches. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ExactWordQuery.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "Query.h" + +class WordSearcher; + +class ExactWordQuery : public Query +{ +public: + // construct for word w + ExactWordQuery(const String &w) : + word(w), weight(1.0) {} + + // destruct + ~ExactWordQuery() {} + + // set the common db wrapper + static void SetSearcher(WordSearcher *c) { searcher = c; } + + // weight accessor + void SetWeight(double x) { weight = x; } + double GetWeight() const { return weight; } + +private: + // forbidden + ExactWordQuery() {} + + // go search the db + ResultList *Evaluate(); + + // set my weight to the list + void AdjustWeight(ResultList &); + + // unparse + String GetLogicalWords() const { return word; } + + // unique cache index + String GetSignature() const + { return String("Exact:")+GetLogicalWords(); } + + // i represent this + String word; + + // my weight + double weight; + + // db wrapper common to all word queries + static WordSearcher *searcher; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/FuzzyExpander.h b/debian/htdig/htdig-3.2.0b6/htsearch/FuzzyExpander.h new file mode 100644 index 00000000..0544d814 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/FuzzyExpander.h @@ -0,0 +1,46 @@ +#ifndef _FuzzyExpander_h_ +#define _FuzzyExpander_h_ + +// +// FuzzyExpander.h +// +// FuzzyExpander: (abstract) root of a family of query factories. 
+// They make fuzzy queries for given words +// and store word weights to results +// by using the existing fuzzy algorithms +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: FuzzyExpander.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +// +// for details about the basic architectural pattern see the book: +// Design Patterns, by the infamous GoF +// Factory pattern +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "htString.h" + +class Query; + +// abstract +class FuzzyExpander +{ +public: + FuzzyExpander() {} + virtual ~FuzzyExpander() {} + + // generate a query for this word + virtual Query *MakeQuery(const String &word) = 0; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/GParser.cc b/debian/htdig/htdig-3.2.0b6/htsearch/GParser.cc new file mode 100644 index 00000000..abf9dbac --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/GParser.cc @@ -0,0 +1,134 @@ +// +// GParser.cc +// +// GParser: An alternate boolean parser, does not use operator precedence. +// -- but why is it called G? 
:-) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: GParser.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "GParser.h" +#include "OrQuery.h" +#include "NearQuery.h" +#include "AndQuery.h" +#include "NotQuery.h" + +Query * +GParser::ParseFactor() +{ + Query *result = 0; + + if(token.IsWord()) + { + result = ParseWord(); + } + else if(token.IsQuote()) + { + token.Next(); + result = ParsePhrase(); + if(result) + { + if(token.IsQuote()) + { + token.Next(); + } + else + { + Expected("closing \""); + delete result; + result = 0; + } + } + } + else if(token.IsLeftParen()) + { + token.Next(); + result = ParseExpression(); + if(result) + { + if(token.IsRightParen()) + { + token.Next(); + } + else + { + Expected(")"); + delete result; + result = 0; + } + } + } + else + { + Expected("'(', '\"', or a word"); + } + return result; +} + +OperatorQuery * +GParser::MakeOperatorQuery(const String &op) const +{ +cerr << "Making operator for " << op << endl; + OperatorQuery *result = 0; + if(op == String("or")) + { + result = new OrQuery; + } + else if(op == String("and")) + { + result = new AndQuery; + } + else if(op == String("not")) + { + result = new NotQuery; + } + else if(op == String("near")) + { + result = new NearQuery; + } + return result; +} + + +Query * +GParser::ParseExpression() +{ + List factors; + Query *result = 0; + String op = ""; + + Query *factor = ParseFactor(); + if(factor) + { + result = factor; + } + while(factor && (token.IsOr() || token.IsAnd() || token.IsNot() || token.IsNear())) + { + if(op != token.Value()) + { + Query *previous = result; + result = MakeOperatorQuery(token.Value()); + result->Add(previous); + op = token.Value(); + } + token.Next(); + factor = ParseFactor(); + if(factor) + { + 
result->Add(factor); + } + } + if(!factor && result) + { + delete result; + result = 0; + } + return result; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/GParser.h b/debian/htdig/htdig-3.2.0b6/htsearch/GParser.h new file mode 100644 index 00000000..d66bdcd2 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/GParser.h @@ -0,0 +1,47 @@ +#ifndef _GParser_h_ +#define _GParser_h_ + +// +// GParser.h +// +// GParser: An alternate boolean parser, does not use operator precedence. +// -- but why is it called G? :-) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: GParser.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryParser.h" +#include "BooleanLexer.h" + +class OperatorQuery; + +class GParser : public QueryParser +{ +public: + GParser() {} + ~GParser() {} + +private: + // apply the single-level syntax + Query *ParseExpression(); + + // apply the factor syntax + Query *ParseFactor(); + + // return the adequate operator for an operator keyword + OperatorQuery *MakeOperatorQuery(const String &op) const; + + // give the parent access to the lexical analyzer + QueryLexer &Token() { return token; } + + // the lexer + BooleanLexer token; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/HtURLSeedScore.cc b/debian/htdig/htdig-3.2.0b6/htsearch/HtURLSeedScore.cc new file mode 100644 index 00000000..d372d22c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/HtURLSeedScore.cc @@ -0,0 +1,215 @@ +// +// HtURLSeedScore.cc +// +// URLSeedScore: +// Holds a list of configured adjustments to be applied on a given +// score and given URL. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: HtURLSeedScore.cc,v 1.6 2004/05/28 13:15:24 lha Exp $ + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "StringList.h" +#include "HtRegex.h" +#include "HtURLSeedScore.h" + +#include <stdio.h> +#include <ctype.h> + +// This class is only used in private members of URLSeedScore. +// The OO-right thing would be to nest this inside the private +// declaration of HtURLSeedScore, but that would cause portability +// problems according to +// <URL:http://www.mozilla.org/hacking/portable-cpp.html#inner_classes>. + +class ScoreAdjustItem : public Object +{ +public: + // Construct from a string applicable to StringMatch, and a string to + // parse for a formula. + ScoreAdjustItem(String &, String &); + + ~ScoreAdjustItem(); + + // Does this item match? + inline bool Match(const String &s) { return match.match(s, 1, 0) != 0; } + + // Return the argument adjusted according to this item. + double adjust_score(double orig) + { return orig*my_mul_factor + my_add_constant; } + + // Error in parsing? Message given here if non-empty string. + String& ErrMsg() { return myErrMsg; } + +private: + double my_add_constant; + double my_mul_factor; + HtRegex match; + + static String myErrMsg; + + // These member functions are not supposed to be implemented, but + // mentioned here as private so the compiler will not generate them if + // someone puts in buggy code that would use them. + ScoreAdjustItem(); + ScoreAdjustItem(const ScoreAdjustItem &); + void operator= (const ScoreAdjustItem &); +}; + +// Definition of myErrMsg. 
+String ScoreAdjustItem::myErrMsg(""); + +ScoreAdjustItem::ScoreAdjustItem(String &url_regex, String &formula) +{ + double mul_factor = 1; + double add_constant = 0; + bool factor_found = false; + bool constant_found = false; + int chars_so_far; + StringList l(url_regex.get(), '|'); + match.setEscaped(l); + + // FIXME: Missing method to check if the regex was in error. + // myErrMsg = form("%s is not a valid regex", url_regex.get()); + + char *s = formula.get(); + + // Parse the ([*]N[ ]*)?[+]?M format. + if (s[0] == '*') + { + // Skip past the '*'. + s++; + + // There is a mul_factor. Let's parse it. + chars_so_far = 0; + sscanf(s, "%lf%n", &mul_factor, &chars_so_far); + + // If '%lf' failed to match, then it will show up as either no + // assignment to chars_so_far, or as writing 0 there. + if (chars_so_far == 0) + { + myErrMsg = form("%s is not a valid adjustment formula", s); + return; + } + + // Skip past the number. + s += chars_so_far; + + // Skip any whitespaces. + while (isspace(*s)) + s++; + + // Eat any plus-sign; it's redundant if alone, and may come before a + // minus. + if (*s == '+') + s++; + + factor_found = true; + } + + // If there's anything here, it must be the additive constant. + if (*s) + { + chars_so_far = 0; + sscanf(s, "%lf%n", &add_constant, &chars_so_far); + + // If '%lf' failed to match, then it will show up as either no + // assignment to chars_so_far, or as writing 0 there. + // We also need to check that it was the end of the input. + if (chars_so_far == 0 || s[chars_so_far] != 0) + { + myErrMsg = form("%s is not a valid adjustment formula", + formula.get()); + return; + } + + constant_found = true; + } + + // Either part must be there. 
+ if (!factor_found && !constant_found) + { + myErrMsg = form("%s is not a valid formula", formula.get()); + return; + } + + my_add_constant = add_constant; + my_mul_factor = mul_factor; +} + +ScoreAdjustItem::~ScoreAdjustItem() +{ +} + +URLSeedScore::URLSeedScore(Configuration &config) +{ + char *config_item = "url_seed_score"; + + StringList sl(config[config_item], "\t \r\n"); + + myAdjustmentList = new List(); + + if (sl.Count() % 2) + { + myErrMsg = form("%s is not a list of pairs (odd number of items)", + config_item); + + // We *could* continue, but that just means the error will be harder + // to find, unless someone actually sees the error message. + return; + } + + // Parse each as in TemplateList::createFromString. + for (int i = 0; i < sl.Count(); i += 2) + { + String url_regex = sl[i]; + String adjust_formula = sl[i+1]; + + ScoreAdjustItem *adjust_item + = new ScoreAdjustItem(url_regex, adjust_formula); + + if (adjust_item->ErrMsg().length() != 0) + { + // No point in continuing beyond the error; we might just + // overwrite the first error. + myErrMsg = form("While parsing %s: %s", + config_item, + adjust_item->ErrMsg().get()); + return; + } + + myAdjustmentList->Add(adjust_item); + } +} + +URLSeedScore::~URLSeedScore() +{ + delete myAdjustmentList; +} + +double +URLSeedScore::noninline_adjust_score(double orig_score, const String &url) +{ + List *adjlist = myAdjustmentList; + ScoreAdjustItem *adjust_item; + + adjlist->Start_Get(); + + while ((adjust_item = (ScoreAdjustItem *) adjlist->Get_Next())) + { + // Use the first match only. + if (adjust_item->Match(url)) + return adjust_item->adjust_score(orig_score); + } + + // We'll get here if no match was found. 
+ return orig_score; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/HtURLSeedScore.h b/debian/htdig/htdig-3.2.0b6/htsearch/HtURLSeedScore.h new file mode 100644 index 00000000..49f8e64e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/HtURLSeedScore.h @@ -0,0 +1,55 @@ +// +// HtURLSeedScore.h +// +// URLSeedScore: Constructed from a Configuration, see doc +// for format of config item "url_seed_score". +// Method "double adjust_score(double score, const String &url)" +// returns an adjusted score, given the original score, or returns the +// original score if there was no adjustment to do. +// +// $Id: HtURLSeedScore.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +#ifndef __HtURLSeedScore_h +#define __HtURLSeedScore_h + +#include "Configuration.h" +#include "List.h" + +class URLSeedScore +{ +public: + URLSeedScore(Configuration &); + ~URLSeedScore(); + + // Return the "adjusted" score. Use an inline method to avoid + // function-call overhead when this feature is unused. + double adjust_score(double score, const String& url) + { + return myAdjustmentList->Count() == 0 + ? score : noninline_adjust_score(score, url); + } + + // If an error was discovered during the parsing of + // the configuration, this member gives a + // nonempty String with an error message. + const String& ErrMsg() { return myErrMsg; } + +private: + double noninline_adjust_score(double score, const String& url); + + // These member functions are not supposed to be implemented. 
+ URLSeedScore(); + URLSeedScore(const URLSeedScore &); + void operator= (const URLSeedScore &); + + List *myAdjustmentList; + String myErrMsg; +}; + +#endif /* __HtURLSeedScore_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.am b/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.am new file mode 100644 index 00000000..520c37a6 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.am @@ -0,0 +1,35 @@ + +include $(top_srcdir)/Makefile.config + +bindir = $(CGIBIN_DIR) + +LOCAL_DEFINES = -DCONFIG_DIR=\"$(CONFIG_DIR)\" -I$(top_srcdir)/htfuzzy + +bin_PROGRAMS = htsearch qtest + +htsearch_SOURCES = Display.cc DocMatch.cc ResultList.cc ResultMatch.cc \ + Template.cc TemplateList.cc WeightWord.cc htsearch.cc \ + parser.cc Collection.cc SplitMatches.cc HtURLSeedScore.cc +noinst_HEADERS = Display.h DocMatch.h ResultList.h ResultMatch.h \ + Template.h TemplateList.h WeightWord.h htsearch.h parser.h \ + Collection.h SplitMatches.h HtURLSeedScore.h \ + WordSearcher.h AndQuery.h AndQueryParser.h BooleanLexer.h \ + BooleanQueryParser.h ExactWordQuery.h FuzzyExpander.h GParser.h \ + NearQuery.h NotQuery.h OperatorQuery.h OrFuzzyExpander.h \ + OrQuery.h OrQueryParser.h PhraseQuery.h Query.h QueryCache.h \ + QueryLexer.h QueryParser.h SimpleLexer.h SimpleQueryParser.h \ + VolatileCache.h +htsearch_DEPENDENCIES = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) +htsearch_LDFLAGS = $(PROFILING) ${extra_ldflags} +htsearch_LDADD = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) + +qtest_SOURCES = DocMatch.cc ResultList.cc AndQuery.cc \ + BooleanLexer.cc BooleanQueryParser.cc ExactWordQuery.cc \ + GParser.cc NearQuery.cc NotQuery.cc OperatorQuery.cc \ + OrFuzzyExpander.cc OrQuery.cc PhraseQuery.cc Query.cc \ + QueryLexer.cc QueryParser.cc SimpleQueryParser.cc VolatileCache.cc \ + WordSearcher.cc qtest.cc +qtest_DEPENDENCIES = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) +qtest_LDFLAGS = $(PROFILING) ${extra_ldflags} +qtest_LDADD = $(top_builddir)/htfuzzy/libfuzzy.la 
$(HTLIBS) + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.in b/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.in new file mode 100644 index 00000000..c83b8e2c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.in @@ -0,0 +1,519 @@ +# Makefile.in generated by automake 1.7.9 from Makefile.am. +# @configure_input@ + +# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003 +# Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# +# To compile with profiling do the following: +# +# make CFLAGS=-g CXXFLAGS=-g PROFILING=-p all +# + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +top_builddir = .. 
+ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +INSTALL = @INSTALL@ +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_triplet = @host@ +ACLOCAL = @ACLOCAL@ +ALLOCA = @ALLOCA@ +AMDEP_FALSE = @AMDEP_FALSE@ +AMDEP_TRUE = @AMDEP_TRUE@ +AMTAR = @AMTAR@ +APACHE = @APACHE@ +APACHE_MODULES = @APACHE_MODULES@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CGIBIN_DIR = @CGIBIN_DIR@ +COMMON_DIR = @COMMON_DIR@ +CONFIG_DIR = @CONFIG_DIR@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATABASE_DIR = @DATABASE_DIR@ +DEFAULT_CONFIG_FILE = @DEFAULT_CONFIG_FILE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO = @ECHO@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +F77 = @F77@ +FFLAGS = @FFLAGS@ +FIND = @FIND@ +GUNZIP = @GUNZIP@ +HAVE_SSL = @HAVE_SSL@ +HTDIG_MAJOR_VERSION = @HTDIG_MAJOR_VERSION@ +HTDIG_MICRO_VERSION = @HTDIG_MICRO_VERSION@ +HTDIG_MINOR_VERSION = @HTDIG_MINOR_VERSION@ +IMAGE_DIR = @IMAGE_DIR@ +IMAGE_URL_PREFIX = @IMAGE_URL_PREFIX@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@ +MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@ +MAKEINFO = @MAKEINFO@ +MV = @MV@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ 
+PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PERL = @PERL@ +RANLIB = @RANLIB@ +RRDTOOL = @RRDTOOL@ +SEARCH_DIR = @SEARCH_DIR@ +SEARCH_FORM = @SEARCH_FORM@ +SED = @SED@ +SENDMAIL = @SENDMAIL@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +TAR = @TAR@ +TESTS_FALSE = @TESTS_FALSE@ +TESTS_TRUE = @TESTS_TRUE@ +TIME = @TIME@ +TIMEV = @TIMEV@ +USER = @USER@ +VERSION = @VERSION@ +YACC = @YACC@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_F77 = @ac_ct_F77@ +ac_ct_RANLIB = @ac_ct_RANLIB@ +ac_ct_STRIP = @ac_ct_STRIP@ +am__fastdepCC_FALSE = @am__fastdepCC_FALSE@ +am__fastdepCC_TRUE = @am__fastdepCC_TRUE@ +am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@ +am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ + +bindir = $(CGIBIN_DIR) +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +datadir = @datadir@ +exec_prefix = @exec_prefix@ +extra_ldflags = @extra_ldflags@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +oldincludedir = @oldincludedir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ + +AUTOMAKE_OPTIONS = foreign no-dependencies + +INCLUDES = -DDEFAULT_CONFIG_FILE=\"$(DEFAULT_CONFIG_FILE)\" \ + -I$(top_srcdir)/include -I$(top_srcdir)/htlib \ + -I$(top_srcdir)/htnet -I$(top_srcdir)/htcommon \ + -I$(top_srcdir)/htword \ + 
-I$(top_srcdir)/db -I$(top_builddir)/db \ + $(LOCAL_DEFINES) $(PROFILING) + + +HTLIBS = $(top_builddir)/htnet/libhtnet.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/htlib/libht.la \ + $(top_builddir)/htcommon/libcommon.la \ + $(top_builddir)/htword/libhtword.la \ + $(top_builddir)/db/libhtdb.la \ + $(top_builddir)/htlib/libht.la + + +LOCAL_DEFINES = -DCONFIG_DIR=\"$(CONFIG_DIR)\" -I$(top_srcdir)/htfuzzy + +bin_PROGRAMS = htsearch qtest + +htsearch_SOURCES = Display.cc DocMatch.cc ResultList.cc ResultMatch.cc \ + Template.cc TemplateList.cc WeightWord.cc htsearch.cc \ + parser.cc Collection.cc SplitMatches.cc HtURLSeedScore.cc + +noinst_HEADERS = Display.h DocMatch.h ResultList.h ResultMatch.h \ + Template.h TemplateList.h WeightWord.h htsearch.h parser.h \ + Collection.h SplitMatches.h HtURLSeedScore.h \ + WordSearcher.h AndQuery.h AndQueryParser.h BooleanLexer.h \ + BooleanQueryParser.h ExactWordQuery.h FuzzyExpander.h GParser.h \ + NearQuery.h NotQuery.h OperatorQuery.h OrFuzzyExpander.h \ + OrQuery.h OrQueryParser.h PhraseQuery.h Query.h QueryCache.h \ + QueryLexer.h QueryParser.h SimpleLexer.h SimpleQueryParser.h \ + VolatileCache.h + +htsearch_DEPENDENCIES = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) +htsearch_LDFLAGS = $(PROFILING) ${extra_ldflags} +htsearch_LDADD = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) + +qtest_SOURCES = DocMatch.cc ResultList.cc AndQuery.cc \ + BooleanLexer.cc BooleanQueryParser.cc ExactWordQuery.cc \ + GParser.cc NearQuery.cc NotQuery.cc OperatorQuery.cc \ + OrFuzzyExpander.cc OrQuery.cc PhraseQuery.cc Query.cc \ + QueryLexer.cc QueryParser.cc SimpleQueryParser.cc VolatileCache.cc \ + WordSearcher.cc qtest.cc + +qtest_DEPENDENCIES = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) +qtest_LDFLAGS = $(PROFILING) ${extra_ldflags} +qtest_LDADD = $(top_builddir)/htfuzzy/libfuzzy.la $(HTLIBS) +subdir = htsearch +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +mkinstalldirs = $(SHELL) 
$(top_srcdir)/mkinstalldirs +CONFIG_HEADER = $(top_builddir)/include/config.h +CONFIG_CLEAN_FILES = +bin_PROGRAMS = htsearch$(EXEEXT) qtest$(EXEEXT) +PROGRAMS = $(bin_PROGRAMS) + +am_htsearch_OBJECTS = Display.$(OBJEXT) DocMatch.$(OBJEXT) \ + ResultList.$(OBJEXT) ResultMatch.$(OBJEXT) Template.$(OBJEXT) \ + TemplateList.$(OBJEXT) WeightWord.$(OBJEXT) htsearch.$(OBJEXT) \ + parser.$(OBJEXT) Collection.$(OBJEXT) SplitMatches.$(OBJEXT) \ + HtURLSeedScore.$(OBJEXT) +htsearch_OBJECTS = $(am_htsearch_OBJECTS) +am_qtest_OBJECTS = DocMatch.$(OBJEXT) ResultList.$(OBJEXT) \ + AndQuery.$(OBJEXT) BooleanLexer.$(OBJEXT) \ + BooleanQueryParser.$(OBJEXT) ExactWordQuery.$(OBJEXT) \ + GParser.$(OBJEXT) NearQuery.$(OBJEXT) NotQuery.$(OBJEXT) \ + OperatorQuery.$(OBJEXT) OrFuzzyExpander.$(OBJEXT) \ + OrQuery.$(OBJEXT) PhraseQuery.$(OBJEXT) Query.$(OBJEXT) \ + QueryLexer.$(OBJEXT) QueryParser.$(OBJEXT) \ + SimpleQueryParser.$(OBJEXT) VolatileCache.$(OBJEXT) \ + WordSearcher.$(OBJEXT) qtest.$(OBJEXT) +qtest_OBJECTS = $(am_qtest_OBJECTS) + +DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir)/include +depcomp = +am__depfiles_maybe = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) --mode=link $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +DIST_SOURCES = $(htsearch_SOURCES) $(qtest_SOURCES) +HEADERS = $(noinst_HEADERS) + +DIST_COMMON = $(noinst_HEADERS) $(srcdir)/Makefile.in \ + $(top_srcdir)/Makefile.config Makefile.am +SOURCES = $(htsearch_SOURCES) $(qtest_SOURCES) + +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/Makefile.config $(top_srcdir)/configure.in $(ACLOCAL_M4) + cd $(top_srcdir) && \ + $(AUTOMAKE) --foreign htsearch/Makefile +Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in $(top_builddir)/config.status + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe) +binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) +install-binPROGRAMS: $(bin_PROGRAMS) + @$(NORMAL_INSTALL) + $(mkinstalldirs) $(DESTDIR)$(bindir) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + if test -f $$p \ + || test -f $$p1 \ + ; then \ + f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f"; \ + $(INSTALL_PROGRAM_ENV) $(LIBTOOL) --mode=install $(binPROGRAMS_INSTALL) $$p $(DESTDIR)$(bindir)/$$f || exit 1; \ + else :; fi; \ + done + +uninstall-binPROGRAMS: + @$(NORMAL_UNINSTALL) + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \ + echo " rm -f $(DESTDIR)$(bindir)/$$f"; \ + rm -f $(DESTDIR)$(bindir)/$$f; \ + done + 
+clean-binPROGRAMS: + @list='$(bin_PROGRAMS)'; for p in $$list; do \ + f=`echo $$p|sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f $$p $$f"; \ + rm -f $$p $$f ; \ + done +htsearch$(EXEEXT): $(htsearch_OBJECTS) $(htsearch_DEPENDENCIES) + @rm -f htsearch$(EXEEXT) + $(CXXLINK) $(htsearch_LDFLAGS) $(htsearch_OBJECTS) $(htsearch_LDADD) $(LIBS) +qtest$(EXEEXT): $(qtest_OBJECTS) $(qtest_DEPENDENCIES) + @rm -f qtest$(EXEEXT) + $(CXXLINK) $(qtest_LDFLAGS) $(qtest_OBJECTS) $(qtest_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) core *.core + +distclean-compile: + -rm -f *.tab.c + +.cc.o: + $(CXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +.cc.obj: + $(CXXCOMPILE) -c -o $@ `if test -f '$<'; then $(CYGPATH_W) '$<'; else $(CYGPATH_W) '$(srcdir)/$<'; fi` + +.cc.lo: + $(LTCXXCOMPILE) -c -o $@ `test -f '$<' || echo '$(srcdir)/'`$< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +distclean-libtool: + -rm -f libtool +uninstall-info-am: + +ETAGS = etags +ETAGSFLAGS = + +CTAGS = ctags +CTAGSFLAGS = + +tags: TAGS + +ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + mkid -fID $$unique + +TAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(ETAGS_ARGS)$$tags$$unique" \ + || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$tags $$unique + +ctags: CTAGS +CTAGS: $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ + $(TAGS_FILES) $(LISP) + tags=; \ + here=`pwd`; \ + list='$(SOURCES) $(HEADERS) $(LISP) 
$(TAGS_FILES)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | \ + $(AWK) ' { files[$$0] = 1; } \ + END { for (i in files) print i; }'`; \ + test -z "$(CTAGS_ARGS)$$tags$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$tags $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && cd $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) $$here + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) + +top_distdir = .. +distdir = $(top_distdir)/$(PACKAGE)-$(VERSION) + +distdir: $(DISTFILES) + $(mkinstalldirs) $(distdir)/.. + @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \ + list='$(DISTFILES)'; for file in $$list; do \ + case $$file in \ + $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \ + $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \ + esac; \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test "$$dir" != "$$file" && test "$$dir" != "."; then \ + dir="/$$dir"; \ + $(mkinstalldirs) "$(distdir)$$dir"; \ + else \ + dir=''; \ + fi; \ + if test -d $$d/$$file; then \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(PROGRAMS) $(HEADERS) + +installdirs: + $(mkinstalldirs) $(DESTDIR)$(bindir) +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am 
+install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." +clean: clean-am + +clean-am: clean-binPROGRAMS clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-libtool distclean-tags + +dvi: dvi-am + +dvi-am: + +info: info-am + +info-am: + +install-data-am: + +install-exec-am: install-binPROGRAMS + +install-info: install-info-am + +install-man: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: uninstall-binPROGRAMS uninstall-info-am + +.PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \ + clean-generic clean-libtool ctags distclean distclean-compile \ + distclean-generic distclean-libtool distclean-tags distdir dvi \ + dvi-am info info-am install install-am install-binPROGRAMS \ + install-data install-data-am install-exec install-exec-am \ + install-info install-info-am install-man install-strip \ + installcheck installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags uninstall uninstall-am uninstall-binPROGRAMS \ + uninstall-info-am + +# Tell versions [3.59,3.63) of GNU make to not export all variables. 
+# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.win32 b/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.win32 new file mode 100644 index 00000000..dfcc9edf --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Makefile.win32 @@ -0,0 +1,30 @@ +# +# Makefile - makefile for htsearch +# + +APP_NAME = Right Now Web CGI +RNT_PRODUCT = rnw + +TARGET = $(BINDIR)/htsearch$(EXESFX) + +include ../Makedefs.win32 + +# ----------------------------------------------------------------------------- +# add new executable members to this list + + +CXXSRC = Display.cc DocMatch.cc ResultList.cc ResultMatch.cc \ + Template.cc TemplateList.cc WeightWord.cc htsearch.cc parser.cc \ + Collection.cc SplitMatches.cc HtURLSeedScore.cc + +CPPFLAGS += -DHAVE_CONFIG_H -I. -I../include -I../htlib -I../htcommon -I../htword -I../db -I../htnet + +LDLIBS = ../lib/$(ARCH)/libhtdb.lib ../lib/$(ARCH)/libcommon.lib ../lib/$(ARCH)/libht.lib ../lib/$(ARCH)/libhtword.lib ../lib/$(ARCH)/libhtnet.lib ../lib/$(ARCH)/libfuzzy.lib +OTHERLIBS = ws2_32.lib L:/win32/lib/zlib114/zlib.lib + +DEPLIBS += $(LDLIBS) + +$(TARGET): $(OBJDIRDEP) $(BINDIRDEP) $(OBJS) $(DEPLIBS) + $(EXELD) $(LDFLAGS) $(OBJS) $(LDLIBS) $(OTHERLIBS) + +include ../Makerules.win32 diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/NearQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/NearQuery.cc new file mode 100644 index 00000000..52487fdc --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/NearQuery.cc @@ -0,0 +1,143 @@ +// +// NearQuery.cc +// +// NearQuery: An operator query that filters matches by proximity. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: NearQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "NearQuery.h" + +String +NearQuery::OperatorString() const +{ + String s; + s << "near/" << distance; + return s; +} + +// +// l r nextTo +// ----------------------- +// 0 0 0 +// 0 b 0 +// 0 x 0 +// a 0 0 +// a b near(a, b) +// a x a +// x 0 0 +// x b b +// x x x +// +ResultList * +NearQuery::Evaluate() +{ + ResultList *result = 0; + Query *left = (Query *)operands[0]; + Query *right = (Query *)operands[1]; + + if(left && right) + { + ResultList *l = left->GetResults(); + if(l) + { + ResultList *r = right->GetResults(); + if(r) + { + if(l->IsIgnore()) + { + result = new ResultList(*r); + } + else if(r->IsIgnore()) + { + result = new ResultList(*l); + } + else + { + result = Near(*l, *r); + } + } + } + } + return result; +} + +ResultList * +NearQuery::Near(const ResultList &l, const ResultList &r) +{ + ResultList *result = 0; + DictionaryCursor c; + l.Start_Get(c); + DocMatch *match = (DocMatch *)l.Get_NextElement(c); + while(match) + { + DocMatch *confirm = r.find(match->GetId()); + if(confirm) + { + List *locations = MergeLocations( + *match->GetLocations(), + *confirm->GetLocations()); + if(locations) + { + if(!result) + { + result = new ResultList; + } + DocMatch *copy = new DocMatch(*match); + copy->SetLocations(locations); + result->add(copy); + } + } + match = (DocMatch *)l.Get_NextElement(c); + } + return result; +} + +// +//: merge match positions in a 'near' operation +// all combinations are tested; the pairs of positions near enough are kept +// +List * +NearQuery::MergeLocations(const List &p, const List &q) +{ + List *result = 0; + ListCursor pc; + p.Start_Get(pc); + const Location *left 
= (const Location *)p.Get_Next(pc); + while(left) + { + ListCursor qc; + q.Start_Get(qc); + const Location *right = (const Location *)q.Get_Next(qc); + while(right) + { + int dist = right->from - left->to; + if(dist < 1) + { + dist = left->from - right->to; + if(dist < 1) + { + dist = 0; + } + } + if(unsigned(dist) <= distance) + { + if(!result) + { + result = new List; + } + result->Add(new Location(*left)); + result->Add(new Location(*right)); + } + right = (const Location *)q.Get_Next(qc); + } + left = (const Location *)p.Get_Next(pc); + } + return result; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/NearQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/NearQuery.h new file mode 100644 index 00000000..77de762b --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/NearQuery.h @@ -0,0 +1,50 @@ +#ifndef _NearQuery_h_ +#define _NearQuery_h_ + +// +// NearQuery.h +// +// NearQuery: An operator query that filters matches by proximity. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: NearQuery.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "OperatorQuery.h" + +class NearQuery : public OperatorQuery +{ +public: + // binary fashion + NearQuery(Query *left, Query *right, unsigned int dist) : + distance(dist) + { Add(left); Add(right); } + + // n-ary fashion -- will ignore operands for n>2 + NearQuery(unsigned int dist = 10) : + distance(dist) {} + +private: + // get results from operands and filter + ResultList *Evaluate(); + + // create a result with neighboring matches + ResultList *Near(const ResultList &, const ResultList &); + + // merge neighboring location lists + List *MergeLocations(const List &, const List &); + + String OperatorString() const; 
+ unsigned int distance; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/NotQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/NotQuery.cc new file mode 100644 index 00000000..11a55c70 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/NotQuery.cc @@ -0,0 +1,110 @@ +// +// NotQuery.cc +// +// NotQuery: 'not' query operator (n-ary not!) +// i.e. not(a, b, c, d...) == a except (b or c or d or...) +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: NotQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + + +#include "NotQuery.h" +// +// l r not +// ------------------------- +// 0 0 0 +// 0 b 0 +// 0 x 0 +// a 0 a +// a b diff(a,b) +// a x a +// x 0 x +// x b x +// x x x +// +// note l is the first operand, r is the rest +// i.e. 
l = 0 => not = 0 +// l = x => not = x +// r = 0 => not = l +// r = x => not = l +// subtract otherwise +// +ResultList * +NotQuery::Evaluate() +{ + operands.Start_Get(); + Query *operand = (Query *) operands.Get_Next(); + ResultList *result = 0; + ResultList *positive = operand->GetResults(); + if(positive) + { + List negative; + if(!positive->IsIgnore()) + { + operand = (Query *) operands.Get_Next(); + while(operand) + { + ResultList *next = operand->GetResults(); + if(next && !next->IsIgnore()) + { + negative.Add(next); + } + operand = (Query *) operands.Get_Next(); + } + } + if(negative.Count()) + { + result = Subtract(*positive, negative); + negative.Release(); + } + else + { + result = new ResultList(*positive); + } + } + return result; +} + +// +// make a result list containing all matches in positive +// with docId absent from negatives +// +ResultList * +NotQuery::Subtract(const ResultList &positive, const List &negatives) +{ + ResultList *result = 0; + DictionaryCursor pc; + positive.Start_Get(pc); + DocMatch *match = (DocMatch *)positive.Get_NextElement(pc); + while(match) + { + bool confirm = true; + ListCursor lc; + negatives.Start_Get(lc); + ResultList *negative = (ResultList *)negatives.Get_Next(lc); + while(confirm && negative) + { + if(negative->exists(match->GetId())) + { + confirm = false; + } + negative = (ResultList *)negatives.Get_Next(lc); + } + if(confirm) + { + if(!result) + { + result = new ResultList; + } + result->add(new DocMatch(*match)); + } + match = (DocMatch *)positive.Get_NextElement(pc); + } + return result; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/NotQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/NotQuery.h new file mode 100644 index 00000000..0585d7ad --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/NotQuery.h @@ -0,0 +1,42 @@ +#ifndef _NotQuery_h_ +#define _NotQuery_h_ + +// +// NotQuery.h +// +// NotQuery: 'not' query operator (n-ary not!) +// i.e. not(a, b, c, d...) == a except (b or c or d or...) 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: NotQuery.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "OperatorQuery.h" + +// +// +class NotQuery : public OperatorQuery +{ +public: + +private: + // evaluate operands and operate + ResultList *Evaluate(); + + // create a difference of the operand results + ResultList *Subtract(const ResultList &, const List &); + + // used by GetLogicalWords + String OperatorString() const { return String("not"); } +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OperatorQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/OperatorQuery.cc new file mode 100644 index 00000000..ebb7ba58 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OperatorQuery.cc @@ -0,0 +1,49 @@ +// +// OperatorQuery.cc +// +// OperatorQuery: (abstract class) a query that combines result lists +// returned by other queries kept in an operand list. +// how they are combined is tbd by the concrete classes. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OperatorQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "OperatorQuery.h" +// +// return a string with the query as a boolean expression +// descends recursively over the operand +// +String +OperatorQuery::GetLogicalWords() const +{ + ListCursor c; + String out; + out << "("; + if(operands.Count()) + { + operands.Start_Get(c); + out << ((Query *) operands.Get_Next(c))->GetLogicalWords(); + Query *next = (Query *) operands.Get_Next(c); + while(next) + { + out << " " << OperatorString() << " "; + if(next) + { + out << next->GetLogicalWords(); + } + else + { + out << "*nothing*"; + } + next = (Query *) operands.Get_Next(c); + } + } + out << ")"; + return out; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OperatorQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/OperatorQuery.h new file mode 100644 index 00000000..5c612ccf --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OperatorQuery.h @@ -0,0 +1,68 @@ +#ifndef _OperatorQuery_h_ +#define _OperatorQuery_h_ + +// +// OperatorQuery.h +// +// OperatorQuery: (abstract class) a query that combines result lists +// returned by other queries kept in an operand list. +// how they are combined is tbd by the concrete classes. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OperatorQuery.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +// +// for details about the basic architectural pattern see the book: +// Design Patterns, by the infamous GoF +// Interpreter pattern +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "Query.h" +#include "List.h" + +// abstract +class OperatorQuery : public Query +{ +public: + virtual ~OperatorQuery() + { + operands.Destroy(); + } + + // add an operand to the operation + void Add(Query *operand) + { + operands.Add(operand); + } + +protected: + OperatorQuery() {} + + // get results from operands and combine them ad-hoc + virtual ResultList *Evaluate() = 0; + + // keyword name of the operation + virtual String OperatorString() const = 0; + + // human-readable unparsed string + virtual String GetLogicalWords() const; + + // cache index + String GetSignature() const + { return String("Compound:")+GetLogicalWords(); } + + // children query operands + List operands; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OrFuzzyExpander.cc b/debian/htdig/htdig-3.2.0b6/htsearch/OrFuzzyExpander.cc new file mode 100644 index 00000000..d288496d --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OrFuzzyExpander.cc @@ -0,0 +1,94 @@ +// +// OrFuzzyExpander.cc +// +// OrFuzzyExpander: a concrete Fuzzy expander that makes a OR with +// all the results returned by the applicable Fuzzies. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OrFuzzyExpander.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "OrFuzzyExpander.h" +#include "Dictionary.h" +#include "ExactWordQuery.h" +#include "OrQuery.h" + +extern int debug; + +// +// creates a query with a OrQuery with all the +// distinct fuzzy results +// +// additionally, sets fuzzy scores for used words +// +Query * +OrFuzzyExpander::MakeQuery(const String &word) +{ + Query *result = 0; + Dictionary exacts; + + // for each configured fuzzy + filters.Start_Get(); + Fuzzy *fuzzy = (Fuzzy *)filters.Get_Next(); + while(fuzzy) + { + // for each word expanded by fuzzy + List words; + String nonconst = word; + fuzzy->getWords(nonconst, words); + words.Start_Get(); + String *w = (String *)words.Get_Next(); + while(w) + { + // if not yet expanded by another fuzzy + // add it to the big Or + if(debug) cerr << "fuzzy " << word << "=" << *w << endl; + ExactWordQuery *exact = (ExactWordQuery *)exacts[*w]; + if(!exact) + { + exact = new ExactWordQuery(*w); + exact->SetWeight(fuzzy->getWeight()); + exacts.Add(*w, exact); + } + // otherwise, just adjust the weight + else + { + exact->SetWeight( + exact->GetWeight() + + fuzzy->getWeight()); + } + w = (String *)words.Get_Next(); + } + fuzzy = (Fuzzy *)filters.Get_Next(); + } + + // return the expanded query + // a single word or + // a Or with all the expanded words + exacts.Start_Get(); + Query *exact = (Query *)exacts.Get_NextElement(); + if(exact) + { + result = exact; + exact = (Query *)exacts.Get_NextElement(); + } + if(exact) + { + Query *tmp = result; + result = new OrQuery; + result->Add(tmp); + while(exact) + { + result->Add(exact); + exact = (Query *)exacts.Get_NextElement(); + } + } + exacts.Release(); 
+ + return result; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OrFuzzyExpander.h b/debian/htdig/htdig-3.2.0b6/htsearch/OrFuzzyExpander.h new file mode 100644 index 00000000..4287f261 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OrFuzzyExpander.h @@ -0,0 +1,49 @@ +#ifndef _OrFuzzyExpander_h_ +#define _OrFuzzyExpander_h_ + +// +// OrFuzzyExpander.h +// +// OrFuzzyExpander: a concrete Fuzzy expander that makes a OR with +// all the results returned by the applicable Fuzzies. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OrFuzzyExpander.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "FuzzyExpander.h" +#include "List.h" +#include "Fuzzy.h" + +// +// makes a Or query with all the fuzzy expansions +// +class Fuzzy; +class OrFuzzyExpander : public FuzzyExpander +{ +public: + OrFuzzyExpander() {} + virtual ~OrFuzzyExpander() { filters.Release(); } + + // use this filter + void Add(Fuzzy *filter) { filters.Add(filter); } + + +private: + // generate a OrQuery with all fuzzies found + Query *MakeQuery(const String &word); + + // Fuzzies to be used + List filters; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OrQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/OrQuery.cc new file mode 100644 index 00000000..facbd9b0 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OrQuery.cc @@ -0,0 +1,126 @@ +// +// OrQuery.cc +// +// OrQuery: an operator query that merges all the results of its operands +// i.e. 
does 'or' combination +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OrQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + + +#include "OrQuery.h" +// +// return a ResultList containing an Or of the results of the operands +// evaluate all operands to do so +// +// l r or +// --------------------- +// 0 0 0 +// 0 b b +// 0 x x +// a 0 a +// a b union(a,b) +// a x a +// x 0 x +// x b b +// x x x +// +// i.e. nulls and ignored are left out union +// +// Note that all operands are evaluated +// Ignored operands are not included in the operation +// the longer input result list is passed separately to Union +// + +ResultList * +OrQuery::Evaluate() +{ + ResultList *result = 0; + ResultList *longer = 0; + List shorter; + int ignores = 0; + operands.Start_Get(); + Query *operand = (Query *) operands.Get_Next(); + while(operand) + { + ResultList *next = operand->GetResults(); + if(next) + { + if(!next->IsIgnore()) + { + if(!longer || longer->Count() < next->Count()) + { + if(longer) + { + shorter.Add(longer); + } + longer = next; + } + else + { + shorter.Add(next); + } + } + else + { + ignores++; + } + } + operand = (Query *) operands.Get_Next(); + } + if(longer) + { + result = Union(*longer, shorter); + shorter.Release(); + } + else if(ignores == operands.Count()) + { + result = new ResultList; + result->Ignore(); + } + return result; +} + +// +// copy unique DocMatches to the resulting list +// matches with the same docId are merged +// the longer list is assumed to be the first parameter +// this is a modest optimisation +// +ResultList * +OrQuery::Union(const ResultList &longer, const List &lists) +{ + ResultList *result = new ResultList(longer); + + ListCursor lc; + lists.Start_Get(lc); + ResultList *current = 
(ResultList *) lists.Get_Next(lc); + while(current) + { + DictionaryCursor c; + current->Start_Get(c); + DocMatch *match = (DocMatch *) current->Get_NextElement(c); + while(match) + { + DocMatch *previous = result->find(match->GetId()); + if(previous) + { + previous->Merge(*match); + } + else + { + DocMatch *copy = new DocMatch(*match); + result->add(copy); + } + match = (DocMatch *) current->Get_NextElement(c); + } + current = (ResultList *) lists.Get_Next(lc); + } + return result; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OrQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/OrQuery.h new file mode 100644 index 00000000..c7f2c09c --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OrQuery.h @@ -0,0 +1,39 @@ +#ifndef _OrQuery_h_ +#define _OrQuery_h_ + +// +// OrQuery.h +// +// OrQuery: an operator query that merges all the results of its operands +// i.e. does 'or' combination +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OrQuery.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "OperatorQuery.h" + +class OrQuery : public OperatorQuery +{ +public: + +private: + // evaluate operands and join results + ResultList *Evaluate(); + + // create a union of the operand results + ResultList *Union(const ResultList &longer, const List &shorter); + + String OperatorString() const { return String("or"); } +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/OrQueryParser.h b/debian/htdig/htdig-3.2.0b6/htsearch/OrQueryParser.h new file mode 100644 index 00000000..ec6a3337 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/OrQueryParser.h @@ -0,0 +1,33 @@ +#ifndef _OrQueryParser_h_ +#define _OrQueryParser_h_ + +// +// OrQueryParser.h +// +// 
OrQueryParser: a query parser for 'any word' (or) queries +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: OrQueryParser.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "SimpleQueryParser.h" +#include "OrQuery.h" + +class OrQueryParser : public SimpleQueryParser +{ +public: + OrQueryParser() {} + +private: + OperatorQuery *MakeQuery() + { + return new OrQuery; + } +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/PhraseQuery.cc b/debian/htdig/htdig-3.2.0b6/htsearch/PhraseQuery.cc new file mode 100644 index 00000000..a42d97b3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/PhraseQuery.cc @@ -0,0 +1,175 @@ +// +// PhraseQuery.cc +// +// PhraseQuery: an operator query that filters sequenced word matches +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: PhraseQuery.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "PhraseQuery.h" + +// +// evaluate operands and make a result with matches if some. 
+// +ResultList * +PhraseQuery::Evaluate() +{ + ResultList *result = 0; + + operands.Start_Get(); + Query *next = (Query *)operands.Get_Next(); + if(next) + { + result = (ResultList *)next->GetResults(); + next = (Query *)operands.Get_Next(); + } + if(result) + { + result = new ResultList(*result); + } + while(result && next) + { + ResultList *r = next->GetResults(); + if(r) + { + if(result->IsIgnore()) + { + delete result; + result = new ResultList(*r); + } + else if(!r->IsIgnore()) + { + ResultList *tmp = result; + result = Near(*tmp, *r); + delete tmp; + } + next = (Query *)operands.Get_Next(); + } + else + { + delete result; + result = 0; + } + } + return result; +} + +String +PhraseQuery::GetLogicalWords() const +{ + ListCursor c; + String out; + out << "\""; + if(operands.Count()) + { + operands.Start_Get(c); + out << ((Query *) operands.Get_Next(c))->GetLogicalWords(); + Query *next = (Query *) operands.Get_Next(c); + while(next) + { + out << " "; + if(next) + { + out << next->GetLogicalWords(); + } + else + { + out << "*nothing*"; + } + next = (Query *) operands.Get_Next(c); + } + } + out << "\""; + return out; +} + +// +// return a resultlist containing matches that are contiguous +// +ResultList * +PhraseQuery::Near(const ResultList &l, const ResultList &r) +{ + ResultList *result = 0; + DictionaryCursor c; + l.Start_Get(c); + DocMatch *match = (DocMatch *)l.Get_NextElement(c); + while(match) + { + DocMatch *confirm = r.find(match->GetId()); + if(confirm) + { + List *locations = MergeLocations( + *match->GetLocations(), + *confirm->GetLocations()); + if(locations) + { + if(!result) + { + result = new ResultList; + } + DocMatch *copy = new DocMatch(*match); + copy->SetLocations(locations); + result->add(copy); + } + } + match = (DocMatch *)l.Get_NextElement(c); + } + return result; +} + + +// +//: merge match positions in a 'next' operation +// each position of left operand match is tested against right operand positions +// if two contiguous positions are 
found, they are merged into a single one +// beginning at the begin of the left operand +// and ending and the end of the right operand +// +List * +PhraseQuery::MergeLocations(const List &p, const List &q) +{ + List *result = 0; + ListCursor pc; + p.Start_Get(pc); + const Location *left = (const Location *)p.Get_Next(pc); + while(left) + { + ListCursor qc; + q.Start_Get(qc); + const Location *right = (const Location *)q.Get_Next(qc); + while(right) + { + if(left->to + 1 == right->from) + { + double prevsize = left->to - left->from + 1.0; + double addsize = right->to - right->from + 1.0; + double weight = + ((left->weight * prevsize) + + (right->weight * addsize)) / + (right->to - left->from + 1.0); + + if(!result) + { + result = new List; + } + + result->Add(new Location( + left->from, + right->to, + left->flags & right->flags, + weight)); + break; + } + right = (const Location *)q.Get_Next(qc); + } + left = (const Location *)p.Get_Next(pc); + } + return result; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/PhraseQuery.h b/debian/htdig/htdig-3.2.0b6/htsearch/PhraseQuery.h new file mode 100644 index 00000000..c93ddeaa --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/PhraseQuery.h @@ -0,0 +1,45 @@ +#ifndef _PhraseQuery_h_ +#define _PhraseQuery_h_ + +// +// PhraseQuery.h +// +// PhraseQuery: an operator query that filters sequenced word matches +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: PhraseQuery.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "OperatorQuery.h" + +class PhraseQuery : public OperatorQuery +{ +public: + PhraseQuery() {} + ~PhraseQuery() {} + +private: + // get results from operands and filter + ResultList *Evaluate(); 
+ + // create a result with neighboring matches + ResultList *Near(const ResultList &, const ResultList &); + + // merge neighboring location lists, constructing phrase locations + List *MergeLocations(const List &, const List &); + + String OperatorString() const { return ""; } + + String GetLogicalWords() const; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Query.cc b/debian/htdig/htdig-3.2.0b6/htsearch/Query.cc new file mode 100644 index 00000000..0be01033 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Query.cc @@ -0,0 +1,89 @@ +// +// Query.cc +// +// Query: (abstract) a parsed, 'executable' digger database query +// a query tree is formed by leaf objects (ExactWordQuery) and +// node objects (OperatorQuery) derived from this class. +// Query execution results are returned as ResultList objects. +// Query evaluation is cached. Cache policy is delegated to the +// QueryCache class family. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Query.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "Query.h" +#include "VolatileCache.h" + +// +// the in-memory query result cache. 
the default instance is +// defined static so its destructor is called at program exit +// +VolatileCache theDefaultCache; + +QueryCache * +Query::cache = &theDefaultCache; + +extern int debug; + +// +// destructor +// +Query::~Query() +{ +} + +// +// return a ResultList with the query results +// results are initially fetched from the cache +// if not cached, the query is evaluated +// Weight of the results is adjusted at each invocation, as +// the same result list may be shared by different queries +// but different weights may be assigned to the word +// +// +ResultList * +Query::GetResults() +{ + ResultList *result = 0; + + // try to find in cache before trying eval + String signature; + if(cache) + { + signature = GetSignature(); + result = cache->Lookup(signature); + } + + // no cache or not in cache, evaluate + if(!result) + { + if(debug) cerr << "EVAL: " << signature << endl; + result = Evaluate(); + + if(cache) + { + cache->Add(signature, result); + } + } + + // adjust if something found/returned + if(result) + { + if(result->Count()) + { + AdjustWeight(*result); + } + else if(!result->IsIgnore()) + { + result = 0; + } + } + return result; +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Query.h b/debian/htdig/htdig-3.2.0b6/htsearch/Query.h new file mode 100644 index 00000000..f6045ac8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Query.h @@ -0,0 +1,77 @@ +#ifndef __Query_h__ +#define __Query_h__ + +// +// Query.h +// +// Query: (abstract) a parsed, 'executable' digger database query +// a query tree is formed by leaf objects (ExactWordQuery) and +// node objects (OperatorQuery) derived from this class. +// Query execution results are returned as ResultList objects. +// Query evaluation is cached. Cache policy is delegated to the +// QueryCache class family. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Query.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +// +// for details about the basic architectural patterns see the book: +// Design Patterns, by the infamous GoF +// Interpreter pattern +// Factory pattern +// Flyweight pattern +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "Object.h" +#include "htString.h" +#include "ResultList.h" + +class QueryCache; + +// abstract +class Query : public Object +{ +public: + // destr + virtual ~Query(); + + // does nothing here -- hack for comfortable parser coding + virtual void Add(Query *) {} + + // get a boolean-style query string + virtual String GetLogicalWords() const = 0; + + // evaluate if necessary and return results + ResultList *GetResults(); + + // set a cache policy + static void SetCache(QueryCache *c) { cache = c; } + +protected: + // get an unique cache index + virtual String GetSignature() const = 0; + + Query() {} + + // generate results + virtual ResultList *Evaluate() = 0; + + // by default, nothing -- for use of leaf queries + virtual void AdjustWeight(ResultList &) {} + +private: + // the current cache object, if some + static QueryCache *cache; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/QueryCache.h b/debian/htdig/htdig-3.2.0b6/htsearch/QueryCache.h new file mode 100644 index 00000000..fd9f53fa --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/QueryCache.h @@ -0,0 +1,45 @@ +#ifndef _QueryCache_h_ +#define _QueryCache_h_ + +// +// QueryCache.h +// +// QueryCache: (abstract) interface for the current Query cache policy. +// A cache stores ResultLists indexed by a signature string. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: QueryCache.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif +#include "Object.h" +#include "htString.h" + +class ResultList; + +// abstract +class QueryCache : public Object +{ +public: + // destructor + virtual ~QueryCache() {} + + // get cached result for a query signature + virtual ResultList *Lookup(const String &signature) = 0; + + // add result to be cached for a query signature + virtual void Add(const String &signature, ResultList *entry) = 0; + +protected: + // construction + QueryCache() {} +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/QueryLexer.cc b/debian/htdig/htdig-3.2.0b6/htsearch/QueryLexer.cc new file mode 100644 index 00000000..ea57e3ce --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/QueryLexer.cc @@ -0,0 +1,84 @@ +// +// QueryLexer.cc +// +// QueryLexer: (abstract) a lexical analyzer used by a QueryParser. +// This class defines the common public interface of this +// family of lexers. It implements a tokenizer, and also +// the definition of the 'quote' and 'end' terminal symbols. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: QueryLexer.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryLexer.h" +#include "defaults.h" +#include "WordType.h" + + +extern int debug; + +QueryLexer::QueryLexer() +{ + HtConfiguration* config= HtConfiguration::config(); + prefix_match = config->Find("prefix_match_character"); +} + +void +QueryLexer::Set(const String &query_string) +{ + query = query_string; + current_char = 0; + Next(); +} + +void +QueryLexer::Next() +{ + HtConfiguration* config= HtConfiguration::config(); + unsigned char text = query[current_char]; + WordType type(*config); + current = ""; + + while (text + && !current.length() + && !type.IsStrictChar(text)) + { + if (text == '(' || text == ')' || text == '\"' || text == '/') + { + current << text; + if (debug) cerr << "lexer symbol: " << current << endl; + } + text = query[++current_char]; + } + + if (!current.length() && text) + { + while (text + && (type.IsChar(text) && text != '/' + || prefix_match.indexOf(text, 0) != -1)) + { + current << text; + text = query[++current_char]; + } + } + current.lowercase(); + if (debug) cerr << "lexer current word: " << current << endl; +} + +bool +QueryLexer::IsEnd() const +{ + return current == String(""); +} + +bool +QueryLexer::IsQuote() const +{ + return current == String("\""); +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/QueryLexer.h b/debian/htdig/htdig-3.2.0b6/htsearch/QueryLexer.h new file mode 100644 index 00000000..bbf57734 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/QueryLexer.h @@ -0,0 +1,71 @@ +#ifndef _QueryLexer_h_ +#define _QueryLexer_h_ + +// +// QueryLexer.h +// +// QueryLexer: (abstract) a lexical analyzer used by a QueryParser. 
+// This class defines the common public interface of this +// family of lexers. It implements a tokenizer, and also +// the definition of the 'quote' and 'end' terminal symbols. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: QueryLexer.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "htString.h" + +class QueryLexer +{ +public: + virtual ~QueryLexer() {} + + // set the query string and advance to the first token + void Set(const String &query_string); + + // advance to the next token + virtual void Next(); + + // is the current token a word? + virtual bool IsWord() const = 0; + + // is the current token a quote sign? + bool IsQuote() const; + + // is the current token end-of-query? + bool IsEnd() const; + + // get the current token value + const String &Value() const { return current; } + + // get the full query string + const String &FullString() const { return query; } + + +protected: + QueryLexer(); + + // the full query string + String query; + + // the current token value + String current; + + // the current position in the query string + int current_char; + + // suffix string used by the 'prefix' fuzzy + String prefix_match; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/QueryParser.cc b/debian/htdig/htdig-3.2.0b6/htsearch/QueryParser.cc new file mode 100644 index 00000000..ad74b8ba --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/QueryParser.cc @@ -0,0 +1,134 @@ +// +// QueryParser.cc +// +// QueryParser: (abstract) root of the family of classes that create +// Query trees by analyzing query strings. +// The main public interface consists on Parse(), +// which does the job. +// The subclasses must provide a lexer. 
+// This class implements also the common behaviour needed to +// parse single words and phrases. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: QueryParser.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryParser.h" +#include "Query.h" +#include "htString.h" +#include "ExactWordQuery.h" +#include "PhraseQuery.h" +#include "FuzzyExpander.h" + +extern int debug; + +FuzzyExpander * +QueryParser::expander = 0; + +// +// parse a query string +// +// +Query * +QueryParser::Parse(const String &query_string) +{ + error = ""; + Token().Set(query_string); + + Query *result = ParseExpression(); + if(result && !Token().IsEnd()) + { + Expected("end of query"); + // delete result; + result = 0; + } + return result; +} + +// parse one word +// return a fuzzy word query +// +Query * +QueryParser::ParseWord() +{ + Query *result = 0; + if(expander) + { + result = expander->MakeQuery(Token().Value()); + } + else + { + result = new ExactWordQuery(Token().Value()); + } + Token().Next(); + return result; +} + +// +// parse one word +// return an exact query +// +Query * +QueryParser::ParseExactWord() +{ + Query *result = new ExactWordQuery(Token().Value()); + Token().Next(); + return result; +} + +// +// phrase == word { word } +// +Query * +QueryParser::ParsePhrase() +{ + Query *result = 0; + Query *word = 0; + if(!Token().IsEnd() && !Token().IsQuote()) + { + word = ParseExactWord(); + } + if(word) + { + result = new PhraseQuery; + result->Add(word); + while(word && !Token().IsEnd() && !Token().IsQuote()) + { + word = ParseExactWord(); + if(word) + { + result->Add(word); + } + } + } + if(!word && result) + { + delete result; + result = 0; + } + if(!result) + { + Expected("at least one word after \""); + } + return 
result; +} + +void +QueryParser::Expected(const String &what) +{ + error << "Expected " << what; + if(Token().IsEnd()) + { + error << " at the end"; + } + else + { + error << " instead of '" << Token().Value() << "'"; + } +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/QueryParser.h b/debian/htdig/htdig-3.2.0b6/htsearch/QueryParser.h new file mode 100644 index 00000000..0af8ae30 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/QueryParser.h @@ -0,0 +1,75 @@ +#ifndef _QueryParser_h_ +#define _QueryParser_h_ + +// +// QueryParser.h +// +// QueryParser: (abstract) root of the family of classes that create +// Query trees by analyzing query strings. +// The main public interface consists on Parse(), +// which does the job. +// The subclasses must provide a lexer. +// This class implements also the common behaviour needed to +// parse single words and phrases. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: QueryParser.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryLexer.h" + +class Query; +class FuzzyExpander; + +// abstract +class QueryParser +{ +public: + virtual ~QueryParser() {} + + // do it + Query *Parse(const String &query_string); + + // contains a diagnostic if Parse() failed + const String &Error() const + { return error; } + + // set a fuzzy word expansion policy + static void SetFuzzyExpander(FuzzyExpander *x) + { expander = x; } + +protected: + QueryParser() {} + + // apply a syntax -- tbd by derived classes + virtual Query *ParseExpression() = 0; + + // access to the lexer -- provided by children + virtual QueryLexer &Token() = 0; + + // parse one (fuzzy) word + Query *ParseWord(); + + // parse an exact word + Query *ParseExactWord(); + + // parse a phrase + Query 
*ParsePhrase(); + + // set the error string on syntax error + void Expected(const String &what); + + // the current fuzzy expansion policy if some + static FuzzyExpander *expander; + +private: + // syntax error if some + String error; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/ResultList.cc b/debian/htdig/htdig-3.2.0b6/htsearch/ResultList.cc new file mode 100644 index 00000000..969c7bb0 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/ResultList.cc @@ -0,0 +1,151 @@ +// +// ResultList.cc +// +// ResultList: A Dictionary indexed on the document id that holds +// documents found for a search. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ResultList.cc,v 1.10 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "ResultList.h" +#include "htString.h" + + +//***************************************************************************** +// ResultList::ResultList() +// +ResultList::ResultList() +{ + isIgnore = 0; +} + + +//***************************************************************************** +// ResultList::~ResultList() +// +ResultList::~ResultList() +{ + //Destroy(); +} + + +//***************************************************************************** +// +void +ResultList::add(DocMatch *dm) +{ + String t; + t << dm->GetId(); + Add(t, dm); +} + + +//***************************************************************************** +// +DocMatch * +ResultList::find(int id) const +{ + String t; + t << id; + return (DocMatch *) Find(t); +} + + +//***************************************************************************** +// +DocMatch * +ResultList::find(char *id) const +{ + return (DocMatch *) Find(id); +} + + 
+//***************************************************************************** +// +void +ResultList::remove(int id) +{ + String t; + t << id; + Remove(t); +} + + +//***************************************************************************** +// +int +ResultList::exists(int id) const +{ + String t; + t << id; + return Exists(t); +} + + +//***************************************************************************** +// +HtVector * +ResultList::elements() +{ + HtVector *list = new HtVector(Count() + 1); + char *id; + + Start_Get(); + while ((id = Get_Next())) + { + list->Add(Find(id)); + } + return list; +} + +void +ResultList::SetWeight(double weight) +{ + HtVector *els = elements(); + for(int i = 0; i < els->Count(); i++) + { + DocMatch *match = (DocMatch *)(*els)[i]; + match->SetWeight(weight); + } + els->Release(); +} + + +ResultList::ResultList(const ResultList &other) +{ + DictionaryCursor c; + isIgnore = other.isIgnore; + other.Start_Get(c); + DocMatch *match = (DocMatch *)other.Get_NextElement(c); + while(match) + { + add(new DocMatch(*match)); + match = (DocMatch *)other.Get_NextElement(c); + } +} + +void +ResultList::Dump() const +{ + cerr << "ResultList {" << endl; + cerr << "Ignore: " << isIgnore << " Count: " << Count() << endl; + DictionaryCursor c; + Start_Get(c); + DocMatch *match = (DocMatch *)Get_NextElement(c); + while(match) + { + match->Dump(); + match = (DocMatch *)Get_NextElement(c); + } + cerr << "}" << endl; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/ResultList.h b/debian/htdig/htdig-3.2.0b6/htsearch/ResultList.h new file mode 100644 index 00000000..5aa925ab --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/ResultList.h @@ -0,0 +1,50 @@ +// +// ResultList.h +// +// ResultList: A Dictionary indexed on the document id that holds +// documents found for a search. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ResultList.h,v 1.8 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _ResultList_h_ +#define _ResultList_h_ + +#include "Dictionary.h" +#include "DocMatch.h" +#include "HtVector.h" + +class ResultList : public Dictionary +{ +public: + ResultList(); + ~ResultList(); + ResultList(const ResultList &); + + void add(DocMatch *); + void remove(int id); + DocMatch *find(int id) const; + DocMatch *find(char *id) const; + int exists(int id) const; + + HtVector *elements(); + + void SetWeight(double weight); + bool IsIgnore() const { return isIgnore != 0; } + void Ignore() { isIgnore = 1; } + + void Dump() const; +//private: + + int isIgnore; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/ResultMatch.cc b/debian/htdig/htdig-3.2.0b6/htsearch/ResultMatch.cc new file mode 100644 index 00000000..54e5f611 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/ResultMatch.cc @@ -0,0 +1,296 @@ +// +// ResultMatch.cc +// +// ResultMatch: Contains information related to a given +// document that was matched by a search. For instance, the +// score of the document for this search. Similar to the +// DocMatch class but designed for result display purposes. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ResultMatch.cc,v 1.10 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "ResultMatch.h" + +// Definition of how to search +ResultMatch::SortType ResultMatch::mySortType; + +//***************************************************************************** +// +ResultMatch::ResultMatch() +{ +} + + +//***************************************************************************** +// +ResultMatch::~ResultMatch() +{ +} + + +//***************************************************************************** +// Default-access-methods. Just dummies when that data is not used. +char *ResultMatch::getTitle() +{ return ""; } + +time_t ResultMatch::getTime() +{ return 0; } + +void ResultMatch::setTitle(char *) +{ } + +void ResultMatch::setTime(time_t) +{ } + +// Then for each sort-type, we derive a class, which will keep +// any necessary additional piece of data, and return the compare-function. + +// We could have a real cute implementation with global +// constructors registering a factory method with ResultMatch, +// so it would just check a list and never need to be changed +// when new search methods are introduced, but that seems futile. +// It is more practical to just add search methods here and +// change the createMatch method, last. 
+ + +//***************************************************************************** +class ScoreMatch : public ResultMatch +{ + // This one needs no additional data +public: + virtual ResultMatch::CmpFun getSortFun(); + ScoreMatch(); + ~ScoreMatch(); +private: + static int compare(const void *a1, const void *a2); +}; + +ScoreMatch::ScoreMatch() {} +ScoreMatch::~ScoreMatch() {} + +int +ScoreMatch::compare(const void *a1, const void *a2) +{ + ResultMatch *m1 = *((ResultMatch **) a1); + ResultMatch *m2 = *((ResultMatch **) a2); + double score1 = m1->getScore(); + double score2 = m2->getScore(); + + if(score1 == score2) + return 0; + else if(score1 < score2) + return 1; + else + return -1; + + // return m2->getScore() - m1->getScore(); +} + +ResultMatch::CmpFun +ScoreMatch::getSortFun() { return compare; } + +//***************************************************************************** +class TimeMatch : public ResultMatch +{ +public: + virtual ResultMatch::CmpFun getSortFun(); + virtual void setTime(time_t); + virtual time_t getTime(); + TimeMatch(); + ~TimeMatch(); +private: + // We need a time_t here, and to override the get/setTime methods. 
+ time_t myTime; + + static int compare(const void *a1, const void *a2); +}; + +TimeMatch::TimeMatch() {} +TimeMatch::~TimeMatch() {} + +void +TimeMatch::setTime(time_t t) +{ + myTime = t; +} + +time_t TimeMatch::getTime() +{ + return myTime; +} + +int +TimeMatch::compare(const void *a1, const void *a2) +{ + ResultMatch *m1 = *((ResultMatch **) a1); + ResultMatch *m2 = *((ResultMatch **) a2); + time_t t1 = m1->getTime(); + time_t t2 = m2->getTime(); + + return (int) (t2 - t1); +} + +ResultMatch::CmpFun +TimeMatch::getSortFun() { return compare; } + +//***************************************************************************** +class IDMatch : public ResultMatch +{ + // This one needs no additional data +public: + virtual ResultMatch::CmpFun getSortFun(); + IDMatch(); + ~IDMatch(); +private: + static int compare(const void *a1, const void *a2); +}; + +IDMatch::IDMatch() {} +IDMatch::~IDMatch() {} + +int +IDMatch::compare(const void *a1, const void *a2) +{ + ResultMatch *m1 = *((ResultMatch **) a1); + ResultMatch *m2 = *((ResultMatch **) a2); + int i1 = m1->getID(); + int i2 = m2->getID(); + + return (i1 - i2); +} + +ResultMatch::CmpFun +IDMatch::getSortFun() { return compare; } + +//***************************************************************************** +class TitleMatch : public ResultMatch +{ +public: + virtual ResultMatch::CmpFun getSortFun(); + virtual void setTitle(char *t); + virtual char *getTitle(); + TitleMatch(); + ~TitleMatch(); +private: + // We need a String here, and to override the get/setTitle methods. + // It has to be a String, as the "char *" goes away shortly + // after creating the object. 
+ String myTitle; + + static int compare(const void *a1, const void *a2); +}; + +TitleMatch::TitleMatch() {} +TitleMatch::~TitleMatch() {} + +void +TitleMatch::setTitle(char *t) +{ + myTitle = t; +} + +char * +TitleMatch::getTitle() +{ + return myTitle; +} + +int +TitleMatch::compare(const void *a1, const void *a2) +{ + ResultMatch *m1 = *((ResultMatch **) a1); + ResultMatch *m2 = *((ResultMatch **) a2); + char *t1 = m1->getTitle(); + char *t2 = m2->getTitle(); + + if (!t1) t1 = ""; + if (!t2) t2 = ""; + return mystrcasecmp(t1, t2); +} + +ResultMatch::CmpFun +TitleMatch::getSortFun() { return compare; } + +//***************************************************************************** +int +ResultMatch::setSortType(const String& sorttype) +{ + static const struct + { + char *typest; + SortType type; + } + sorttypes[] = + { + {"score", SortByScore}, + {"date", SortByTime}, + {"time", SortByTime}, + {"title", SortByTitle}, + {"id", SortByID} + }; + int i = 0; + const char *st = sorttype; + if (st && *st) + { + if (mystrncasecmp("rev", st, 3) == 0) + st += 3; + for (i = sizeof(sorttypes)/sizeof(sorttypes[0]); --i >= 0; ) + { + if (mystrcasecmp(sorttypes[i].typest, st) == 0) + { + mySortType = sorttypes[i].type; + return 1; + } + } + return 0; + } + else + { + // If not specified, default to SortByScore + mySortType = SortByScore; + return 1; + } +} + +//***************************************************************************** +// Now here's the switchboard: a create-function that returns a +// "new":ed object of the right class for what to compare. +// To have the pairing managed in a (dynamically registered) +// list may seem interesting, but since everything is here +// anyway, there's little need but a small cuteness-factor. +// We could also change the guts to use some kind of creator +// object, if there would be a win. 
+ +ResultMatch * +ResultMatch::create() +{ + switch (mySortType) + { + case ResultMatch::SortByScore: + return new ScoreMatch(); + + case ResultMatch::SortByTime: + return new TimeMatch(); + + case ResultMatch::SortByTitle: + return new TitleMatch(); + + case ResultMatch::SortByID: + return new IDMatch(); + + default: + // It is doubtful which is better: to abort() or paper + // over something bad here. + return new ScoreMatch(); + } +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/ResultMatch.h b/debian/htdig/htdig-3.2.0b6/htsearch/ResultMatch.h new file mode 100644 index 00000000..4cac3c5a --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/ResultMatch.h @@ -0,0 +1,89 @@ +// +// ResultMatch.h +// +// ResultMatch: Contains information related to a given +// document that was matched by a search. For instance, the +// score of the document for this search. Similar to the +// DocMatch class but designed for result display purposes. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: ResultMatch.h,v 1.11 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _ResultMatch_h_ +#define _ResultMatch_h_ + +#include "Object.h" +#include "htString.h" + +class DocumentRef; +class Collection; + +class ResultMatch : public Object +{ +public: + // + // Construction/Destruction + // + ResultMatch(); + ~ResultMatch(); + static ResultMatch *create(); + // + // Data access members + // + void setAnchor(int a) {anchor = a;} + void setID(int i) {id = i;} + void setScore(double s) {score = s;} + + int getAnchor() {return anchor;} + double getScore() {return score;} + int getID() {return id;} + + // Multiple database support + void setCollection(Collection *coll) { collection = coll; } + Collection *getCollection() { return collection; } + 
+ static int setSortType(const String& sorttype); + + // A method for each type of data Display wants to cram in. + // Will only store the pieces necessary for the + // search-type as defined in setSortType, the others are dummies. + virtual char *getTitle(); + virtual time_t getTime(); + + virtual void setTitle(char *title); + virtual void setTime(time_t t); + + // This is likely to help weak compilers as well as the eye. + typedef int (*CmpFun)(const void *, const void *); + + // The purpose of the derived classes is to define their own. + virtual CmpFun getSortFun() = 0; + + // Sun's C++ compiler doesn't like private types used in other structs + // so make this public + enum SortType + { + SortByScore, + SortByTime, + SortByTitle, + SortByID + }; + +private: + double score; + int anchor; + int id; + Collection *collection; + + static SortType mySortType; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/SimpleLexer.h b/debian/htdig/htdig-3.2.0b6/htsearch/SimpleLexer.h new file mode 100644 index 00000000..54fbd8ea --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/SimpleLexer.h @@ -0,0 +1,29 @@ +#ifndef _SimpleLexer_h_ +#define _SimpleLexer_h_ + +// +// SimpleLexer.h +// +// SimpleLexer: query lexer for simple (no-keyword) queries +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: SimpleLexer.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryLexer.h" + +class SimpleLexer : public QueryLexer +{ +public: + SimpleLexer() : QueryLexer() {} + + // everything is a word + bool IsWord() const { return !IsEnd(); } +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/SimpleQueryParser.cc b/debian/htdig/htdig-3.2.0b6/htsearch/SimpleQueryParser.cc new file mode 100644 index 
00000000..ebe1901e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/SimpleQueryParser.cc @@ -0,0 +1,96 @@ +// +// SimpleQueryParser.cc +// +// SimpleQueryParser: (abstract) a family of parsers that generate queries +// for strings with the syntax (word|phrase){(word|phrase)} +// combining them in a single operator. +// The operator to apply is tbd by concrete classes. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: SimpleQueryParser.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "SimpleQueryParser.h" +#include "OperatorQuery.h" + +// +// expr == term { term } +// +Query * +SimpleQueryParser::ParseExpression() +{ + Query *result = 0; + Query *term = ParseTerm(); + if(term) + { + if(token.IsEnd()) + { + result = term; + } + else + { + result = MakeQuery(); + result->Add(term); + while(!token.IsEnd()) + { + term = ParseTerm(); + if(term) + { + result->Add(term); + } + } + } + } + if(!term) + { + delete result; + result = 0; + } + return result; +} + + +// +// term == word | '"' phrase '"' +// +Query * +SimpleQueryParser::ParseTerm() +{ + Query *result = 0; + + if(token.IsQuote()) + { + token.Next(); + result = ParsePhrase(); + if(result) + { + if(token.IsQuote()) + { + token.Next(); + } + else + { + Expected("closing \""); + delete result; + result = 0; + } + } + } + else if(token.IsWord()) + { + // don't advance token here! 
+ result = ParseWord(); + } + else + { + Expected("a word or a quoted phrase"); + } + return result; +} + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/SimpleQueryParser.h b/debian/htdig/htdig-3.2.0b6/htsearch/SimpleQueryParser.h new file mode 100644 index 00000000..93ff08ee --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/SimpleQueryParser.h @@ -0,0 +1,52 @@ +#ifndef _SimpleQueryParser_h_ +#define _SimpleQueryParser_h_ + +// +// SimpleQueryParser.h +// +// SimpleQueryParser: (abstract) a family of parsers that generate queries +// for strings with the syntax (word|phrase){(word|phrase)} +// combining them in a single operator. +// The operator to apply is tbd by concrete classes. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: SimpleQueryParser.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "QueryParser.h" +#include "SimpleLexer.h" + +// abstract +class OperatorQuery; + +class SimpleQueryParser : public QueryParser +{ +public: + virtual ~SimpleQueryParser() {} + +protected: + SimpleQueryParser() {} + + // get a combination query + virtual OperatorQuery *MakeQuery() = 0; + +private: + // apply expr == term { term } + Query *ParseExpression(); + + // apply term == word | phrase + Query *ParseTerm(); + + // let the parent access the lexer + QueryLexer &Token() { return token; } + + // the used lexer + SimpleLexer token; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/SplitMatches.cc b/debian/htdig/htdig-3.2.0b6/htsearch/SplitMatches.cc new file mode 100644 index 00000000..6d7f97d8 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/SplitMatches.cc @@ -0,0 +1,184 @@ +// +// SplitMatches.cc +// +// SplitMatches: +// Holds a list of lists with the matches, as specified in +// 
search_results_order. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: SplitMatches.cc,v 1.6 2004/05/28 13:15:24 lha Exp $ + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "StringList.h" +#include "HtRegex.h" +#include "SplitMatches.h" + +#include <stdio.h> +#include <ctype.h> + +// This class is only used in private members of SplitMatches. +// The OO-right thing would be to nest this inside the private +// declaration of SplitMatches, but that would cause portability +// problems according to +// <URL:http://www.mozilla.org/hacking/portable-cpp.html#inner_classes>. +// +// It is used as a container for a key (String) and a list. +// +class MatchArea : public Object +{ +public: + // Construct from a string applicable to StringMatch. + MatchArea(const String &); + + ~MatchArea(); + + // Does this item match? + // Fail if template is empty, since explicit "*" maps to empty template + inline bool Match(char *s) + { return match.match(s, 0, 0) != 0; } + + // Return the contained list. + List *MatchList() { return &myList; } + +private: + HtRegex match; + List myList; + + // These member functions are not supposed to be implemented, but + // mentioned here as private so the compiler will not generate them if + // someone puts in buggy code that would use them. + MatchArea(); + MatchArea(const MatchArea &); + void operator= (const MatchArea &); +}; + +MatchArea::MatchArea(const String &url_regex) +{ + // We do not want to "install" the catch-the-rest pattern as a real + // pattern; it must always return false for the "Match" operator. 
+ if (strcmp("*", url_regex.get()) != 0) + { + StringList l(url_regex.get(),'|'); + match.setEscaped(l); + } +} + +MatchArea::~MatchArea() +{ +} + +SplitMatches::SplitMatches(Configuration &config) +{ + char *config_item = "search_results_order"; + + StringList sl(config[config_item], "\t \r\n"); + + mySubAreas = new List(); + myDefaultList = 0; + + // Parse each as in TemplateList::createFromString. + for (int i = 0; i < sl.Count(); i++) + { + String sub_area_pattern = sl[i]; + MatchArea *match_item = new MatchArea(sub_area_pattern); + mySubAreas->Add(match_item); + + // If this is the magic catch-rest sub-area-pattern, we want to + // use its list-pointer to store all URLs that do not match + // anything else. + // We will iterate over a list where one of the patterns is + // known to not match, but that's a small penalty for keeping + // the code simple. + if (strcmp("*", sub_area_pattern.get()) == 0) + myDefaultList = match_item->MatchList(); + } + + // If we did not have a catch-the-rest pattern, install one at the + // end of the list. + if (myDefaultList == 0) + { + MatchArea *match_item = new MatchArea(String("*")); + mySubAreas->Add(match_item); + + myDefaultList = match_item->MatchList(); + } +} + +SplitMatches::~SplitMatches() +{ + // myDefaultList is a pointer to one of the items in mySubAreas and + // must not be explicitly deleted here. + + delete mySubAreas; +} + +void +SplitMatches::Add(ResultMatch *match, char *url) +{ + List *area_list = mySubAreas; + MatchArea *area_item; + + area_list->Start_Get(); + + // This is a linear search. If there's a problem with that, we + // can improve it. For now, a list with tens of areas seems lots, + // and break-even with a more clever search-scheme is probably in + // the hundreds. + while ((area_item = (MatchArea *) area_list->Get_Next())) + { + // Use the first match only. 
+ if (area_item->Match(url)) + { + area_item->MatchList()->Add(match); + return; + } + } + + // We'll get here if no match was found, so we add to the + // catch-the-rest list. + myDefaultList->Add(match); +} + +// Just a simple iterator function. +List * +SplitMatches::Get_Next() +{ + MatchArea *next_area = (MatchArea *) mySubAreas->Get_Next(); + List *next_area_list = 0; + + if (next_area != 0) + next_area_list = next_area->MatchList(); + + return next_area_list; +} + +// Rip out the sub-areas lists and concatenate them into one list. +List * +SplitMatches::JoinedLists() +{ + + // We make a new list here, so we don't have to worry about + // mySubAreas being dangling or null. + List *all_areas = new List(); + List *sub_areas = mySubAreas; + MatchArea *area; + + sub_areas->Start_Get(); + + while ((area = (MatchArea *) sub_areas->Get_Next())) + { + // "Destructively" move the contents of the list, + // leaving the original list empty. + all_areas->AppendList(*(area->MatchList())); + } + + return all_areas; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/SplitMatches.h b/debian/htdig/htdig-3.2.0b6/htsearch/SplitMatches.h new file mode 100644 index 00000000..2d42a441 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/SplitMatches.h @@ -0,0 +1,53 @@ +// +// SplitMatches.h +// +// SplitMatches: Constructed from a Configuration, see doc +// for format of config item "search_results_order". +// Used to contain a number of ResultMatches, putting them in separate +// lists depending on the URL with method Add. +// Iterator methods Get_First and Get_Next returns the sub-lists. +// Method Joined returns a new list with all the sub-lists +// concatenated. 
+// +// $Id: SplitMatches.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 2000-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +#ifndef _splitmatches_h +#define _splitmatches_h + +#include "Configuration.h" +#include "ResultMatch.h" +#include "List.h" + +class SplitMatches +{ +public: + SplitMatches(Configuration &); + ~SplitMatches(); + + void Add(ResultMatch *, char *); + List *JoinedLists(); + List *Get_First() + { mySubAreas->Start_Get(); return Get_Next(); } + + List *Get_Next(); + +private: + // These member functions are not supposed to be implemented. + SplitMatches(); + SplitMatches(const SplitMatches &); + void operator= (const SplitMatches &); + + // (Lists of) Matches for each sub-area regex. + List *mySubAreas; + + // Matches for everything else. + List *myDefaultList; +}; + +#endif /* _splitmatches_h */ diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Template.cc b/debian/htdig/htdig-3.2.0b6/htsearch/Template.cc new file mode 100644 index 00000000..d1d48095 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Template.cc @@ -0,0 +1,81 @@ +// +// Template.cc +// +// Template: A template to set the display of the search results. +// MatchTemplate is used for every match, Start and End templates +// are used between the header and the first match and the +// last match and the footer respectively. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Template.cc,v 1.8 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "Template.h" + +#include <stdio.h> + + +//***************************************************************************** +Template::Template() +{ +} + + +//***************************************************************************** +Template::~Template() +{ +} + + +//***************************************************************************** +// The start and end templates are created from the filename of the +// main template by appending ".start" and ".end" to the filename +// respectively. +// +void +Template::createFromFile(const char *filename) +{ + String realFile; + + realFile = filename; + realFile << ".start"; + readFile(startTemplate, (char*)realFile); + + realFile = filename; + realFile << ".end"; + readFile(endTemplate, (char*)realFile); + + readFile(matchTemplate, filename); +} + +//***************************************************************************** +// Append the contents of a file to a string. Nothing happens if the file +// doesn't exist. 
+// +void +Template::readFile(String &s, const char *filename) const +{ + FILE *fl = fopen(filename, "r"); + char buffer[1000]; + + if (!fl) + return; + s = 0; + while (fgets(buffer, sizeof(buffer), fl)) + { + s << buffer; + } + fclose(fl); +} + + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/Template.h b/debian/htdig/htdig-3.2.0b6/htsearch/Template.h new file mode 100644 index 00000000..5a7e6af3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/Template.h @@ -0,0 +1,54 @@ +// +// Template.h +// +// Template: A template to set the display of the search results. +// MatchTemplate is used for every match, Start and End templates +// are used between the header and the first match and the +// last match and the footer respectively. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: Template.h,v 1.8 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _Template_h_ +#define _Template_h_ + +#include "Object.h" +#include "htString.h" + +// +// This class holds information about output templates. 
+// +class Template : public Object +{ +public: + Template(); + ~Template(); + + const String& getMatchTemplate() const { return matchTemplate; } + const String& getStartTemplate() const { return startTemplate; } + const String& getEndTemplate() const { return endTemplate; } + + void setMatchTemplate(const char *s) { matchTemplate = s; } + void setStartTemplate(const char *s) { startTemplate = s; } + void setEndTemplate(const char *s) { endTemplate = s; } + + void createFromFile(const char *filename); + +protected: + String matchTemplate; + String startTemplate; + String endTemplate; + +private: + void readFile(String &, const char *) const; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/TemplateList.cc b/debian/htdig/htdig-3.2.0b6/htsearch/TemplateList.cc new file mode 100644 index 00000000..869f3fb1 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/TemplateList.cc @@ -0,0 +1,106 @@ +// +// TemplateList.cc +// +// +// TemplateList: As it sounds--a list of search result templates. Reads the +// configuration and any template files from disk, then retrieves +// the relevant template for display. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: TemplateList.cc,v 1.11 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "TemplateList.h" +#include "URL.h" +#include "QuotedStringList.h" + +//***************************************************************************** +TemplateList::TemplateList() +{ +} + + +//***************************************************************************** +TemplateList::~TemplateList() +{ +} + + +//***************************************************************************** +// Return the template that belongs to the given internal template +// name. If no template can be found, NULL is returned. +// +Template * +TemplateList::get(const String& internalName) +{ + for (int i = 0; i < internalNames.Count(); i++) + { + const String *s = (const String *) internalNames[i]; + if (mystrcasecmp(*s, internalName) == 0) + return (Template *) templates[i]; + } + return 0; +} + + +//***************************************************************************** +// Create a list of templates from a configuration string. The string +// will have triplets of: display name, internal name, and filename. +// There are two special cases for the internal name: builtin-long and +// builtin-short. These will cause a hardcoded template to be +// created. All other templates are read in from the specified +// filename. 
+// +int +TemplateList::createFromString(const String& str) +{ + QuotedStringList sl(str, "\t \r\n"); + String display, internal, file; + Template *t; + + if ( sl.Count() % 3) return 0; // Make sure we have a multiple of three + + for (int i = 0; i < sl.Count(); i += 3) + { + display = sl[i]; + decodeURL(display); + internal = sl[i + 1]; + file = sl[i + 2]; + displayNames.Add(new String(display)); + internalNames.Add(new String(internal)); + + t = new Template(); + + if (mystrcasecmp((char*)file, "builtin-long") == 0) + { + String s; + s << "<dl><dt><strong><a href=\"$&(URL)\">$&(TITLE)</a></strong>"; + s << "$(STARSLEFT)\n"; + s << "</dt><dd>$(EXCERPT)<br>\n"; + s << "<em><a href=\"$&(URL)\">$&(URL)</a></em>\n"; + s << " <font size=\"-1\">$(MODIFIED), $(SIZE) bytes</font>\n"; + s << "</dd></dl>\n"; + t->setMatchTemplate((char*)s); + } + else if (mystrcasecmp((char*)file, "builtin-short") == 0) + { + t->setMatchTemplate("$(STARSRIGHT) <strong><a href=\"$&(URL)\">$&(TITLE)</a></strong><br>\n"); + } + else + { + t->createFromFile((char*)file); + } + templates.Add(t); + } + + return 1; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/TemplateList.h b/debian/htdig/htdig-3.2.0b6/htsearch/TemplateList.h new file mode 100644 index 00000000..f6986fc0 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/TemplateList.h @@ -0,0 +1,40 @@ +// +// TemplateList.h +// +// TemplateList: As it sounds--a list of search result templates. Reads the +// configuration and any template files from disk, then retrieves +// the relevant template for display. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: TemplateList.h,v 1.8 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _TemplateList_h_ +#define _TemplateList_h_ + +#include "Template.h" +#include "Object.h" +#include "List.h" + +class TemplateList : public Object +{ +public: + TemplateList(); + ~TemplateList(); + + int createFromString(const String& str); + Template *get(const String& internalName); + + List displayNames; + List internalNames; + List templates; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/VolatileCache.cc b/debian/htdig/htdig-3.2.0b6/htsearch/VolatileCache.cc new file mode 100644 index 00000000..d3f21f5e --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/VolatileCache.cc @@ -0,0 +1,77 @@ +// +// VolatileCache.cc +// +// VolatileCache: the simplest non-persistent Query result cache. +// This is default policy. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: VolatileCache.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "VolatileCache.h" +#include "ResultList.h" + +// +// a pseudo-constant empty result list +// used to avoid null pointers in the cache +// +ResultList theEmptyResult; + +ResultList * const +VolatileCache::empty = &theEmptyResult; + +extern int debug; + +// +// find a cache entry +// +ResultList * +VolatileCache::Lookup(const String &signature) +{ + ResultList *result = (ResultList *)cache[signature]; + return result; +} + +// +// add a cache entry +// +void +VolatileCache::Add(const String &signature, ResultList *entry) +{ + ResultList *previous = (ResultList *)cache[signature]; + if(previous && previous != empty) + { + delete previous; + } + if(!entry) + { + entry = empty; + } + cache.Add(signature, entry); +} + +// +// clear the in-memory cache +// avoids deletion of the shared 'empty' element +// +VolatileCache::~VolatileCache() +{ + if(debug) cerr << "query CLEAR: entries=" << cache.Count() << endl; + cache.Start_Get(); + ResultList *kill = (ResultList *)cache.Get_NextElement(); + while(kill) + { + if(kill != empty) + { + delete kill; + } + kill = (ResultList *)cache.Get_NextElement(); + } + cache.Release(); +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/VolatileCache.h b/debian/htdig/htdig-3.2.0b6/htsearch/VolatileCache.h new file mode 100644 index 00000000..c57d09d3 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/VolatileCache.h @@ -0,0 +1,44 @@ +#ifndef _VolatileCache_h_ +#define _VolatileCache_h_ + +// +// VolatileCache.h +// +// VolatileCache: the simplest non-persistent Query result cache. +// This is default policy. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: VolatileCache.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "QueryCache.h" +#include "Dictionary.h" + +class VolatileCache : public QueryCache +{ +public: + // cons & destr + VolatileCache() {} + ~VolatileCache(); + + // get cached result from in-memory cache + ResultList *Lookup(const String &signature); + + // add result to in-memory cache + void Add(const String &signature, ResultList *entry); + +private: + Dictionary cache; + static ResultList * const empty; +}; + +#endif diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/WeightWord.cc b/debian/htdig/htdig-3.2.0b6/htsearch/WeightWord.cc new file mode 100644 index 00000000..49eb2e03 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/WeightWord.cc @@ -0,0 +1,146 @@ +// +// WeightWord.cc +// +// WeightWord: Contains the information necessary for a particular search word +// including the resulting weight (scaling factor) and +// whether the word should be hidden (ignored). 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WeightWord.cc,v 1.10 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "WeightWord.h" + +#ifdef HAVE_STD +#include <fstream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <fstream.h> +#endif /* HAVE_STD */ + +//*************************************************************************** +// WeightWord::WeightWord() +// +WeightWord::WeightWord() +{ + weight = 1; + records = 0; + isExact = 0; + isHidden = 0; + isIgnore = 0; + + flags = FLAGS_MATCH_ONE; +} + + +//*************************************************************************** +// WeightWord::WeightWord(WeightWord *ww) +// +WeightWord::WeightWord(WeightWord *ww) +{ + weight = ww->weight; + records = ww->records; + isExact = ww->isExact; + isHidden = ww->isHidden; + flags = ww->flags; + word = ww->word; + isIgnore = 0; +} + + +//*************************************************************************** +// WeightWord::WeightWord(char *word, double weight) +// +WeightWord::WeightWord(char *word, double weight) +{ + records = 0; + isExact = 0; + isHidden = 0; + isIgnore = 0; + + // allow a match with any field + flags = FLAGS_MATCH_ONE; + + set(word); + this->weight = weight; +} + +//*************************************************************************** +// WeightWord::WeightWord(char *word, double weight, unsigned int f) +// +WeightWord::WeightWord(char *word, double weight, unsigned int f) +{ + records = 0; + + flags = f; + // if no fields specified, allow a match with any field + if (!(flags & FLAGS_MATCH_ONE)) + flags ^= FLAGS_MATCH_ONE; + + // ideally, these flags should all just be stored in a uint... 
+ isExact = ((flags & FLAG_EXACT) != 0); + isHidden = ((flags & FLAG_HIDDEN) != 0); + isIgnore = ((flags & FLAG_IGNORE) != 0); + + set(word); + this->weight = weight; +} + + +//*************************************************************************** +// WeightWord::~WeightWord() +// +WeightWord::~WeightWord() +{ +} + + +//*************************************************************************** +// void WeightWord::set(char *word) +// +void WeightWord::set(char *word) +{ +#if 0 + isExact = 0; + isHidden = 0; + while (strchr(word, ':')) + { + // + // This word contains modifiers. + // + if (mystrncasecmp(word, "exact:", 6) == 0) + { + word += 6; + isExact = 1; + } + else if (mystrncasecmp(word, "hidden:", 7) == 0) + { + word += 7; + isHidden = 1; + } + else + { + // + // There is a ':' but not a valid attribute. It must be part + // of the word we are searching for. + // + break; + } + + } +#endif + this->word = word; + this->word.lowercase(); +} + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/WeightWord.h b/debian/htdig/htdig-3.2.0b6/htsearch/WeightWord.h new file mode 100644 index 00000000..313ea362 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/WeightWord.h @@ -0,0 +1,50 @@ +// +// WeightWord.h +// +// WeightWord: Contains the information necessary for a particular search word +// including the resulting weight (scaling factor) and +// whether the word should be hidden (ignored). +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WeightWord.h,v 1.8 2004/05/28 13:15:24 lha Exp $ +// + +#ifndef _WeightWord_h_ +#define _WeightWord_h_ + +#include "htString.h" +#include "WordRecord.h" +#include "HtWordReference.h" // for FLAG_... 
+ +class WeightWord : public Object +{ +public: + // + // Construction/Destruction + // + WeightWord(); + WeightWord(char *word, double weight); + WeightWord(char *word, double weight, unsigned int flags); + WeightWord(WeightWord *); + + virtual ~WeightWord(); + + void set(char *word); + + String word; + double weight; + WordRecord *records; + unsigned int flags; + short int isExact; + short int isHidden; + short int isIgnore; +}; + +#endif + + diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/WordSearcher.cc b/debian/htdig/htdig-3.2.0b6/htsearch/WordSearcher.cc new file mode 100644 index 00000000..7e1669f7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/WordSearcher.cc @@ -0,0 +1,109 @@ +// +// WordSearcher.cc +// +// WordSearcher: a simple word database readonly-access wrapper +// generates ResultLists for the Query framework. +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordSearcher.cc,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#include "WordSearcher.h" +#include "WordType.h" +#include "ResultList.h" +#include "HtWordReference.h" +#include "defaults.h" + +extern int debug; + +// +// constructor, opens the database +// +WordSearcher::WordSearcher(const String &filename) : + references(*(HtConfiguration::config())) +{ + references.Open(filename, O_RDONLY); +} + +// +// gather results for a word, either from db or ignored +// +ResultList * +WordSearcher::Search(const String &word) +{ + ResultList *result = 0; + if(IsIgnore(word)) + { + if(debug) cerr << "IGNORE: " << word << endl; + result = new ResultList; + result->Ignore(); + } + else + { + result = Fetch(word); + } + return result; +} + +// +// see if word must be ignored +// +bool +WordSearcher::IsIgnore(const String &word) +{ + HtConfiguration* 
config= HtConfiguration::config(); + String copy = word; + WordType type(*config); + return 0 != type.Normalize(copy); +} + +// +// gather all references in the db, construct a ResultList +// +ResultList * +WordSearcher::Fetch(const String &word) +{ + if(debug) cerr << "FETCH: " << word << endl; + ResultList *result = 0; + List *refs = references[word]; + + if(refs && refs->Count()) + { + if(debug) cerr << "REFERENCES: " << refs->Count() << endl; + result = new ResultList; + DocMatch *match = new DocMatch; + + refs->Start_Get(); + HtWordReference *ref = (HtWordReference *)refs->Get_Next(); + match->SetId(ref->DocID()); + match->SetAnchor(ref->Anchor()); + result->add(match); + unsigned int current = ref->DocID(); + if(debug) cerr << "At: " << ref->DocID() << endl; + while(ref) + { + if(ref->DocID() != current) + { + if(debug) cerr << "At: "<<ref->DocID()<< endl; + match = new DocMatch; + match->SetId(ref->DocID()); + match->SetAnchor(ref->Anchor()); + result->add(match); + current = ref->DocID(); + } + if(debug) cerr << "@ "<<ref->Location()<< endl; + match->AddLocation( + new Location( + ref->Location(), + ref->Location(), + ref->Flags())); + ref = (HtWordReference *)refs->Get_Next(); + } + } + return result; +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/WordSearcher.h b/debian/htdig/htdig-3.2.0b6/htsearch/WordSearcher.h new file mode 100644 index 00000000..2ef656c9 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/WordSearcher.h @@ -0,0 +1,49 @@ +#ifndef _WordSearcher_h_ +#define _WordSearcher_h_ + +// +// WordSearcher.h +// +// WordSearcher: a simple word database readonly-access wrapper +// generates ResultLists for the Query framework. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: WordSearcher.h,v 1.4 2004/05/28 13:15:24 lha Exp $ +// + +#if HAVE_CONFIG_H +#include "htconfig.h" +#endif + +#include "htString.h" +#include "HtWordList.h" + +class ResultList; + +class WordSearcher +{ +public: + // constructor + WordSearcher(const String &filename); + + // fetch results for one exact word + ResultList *Search(const String &word); + +private: + // word is to be ignored + bool IsIgnore(const String &word); + + // fetch results in database + ResultList *Fetch(const String &word); + + // the database wrapper + HtWordList references; +}; + + +#endif /* _WordSearcher_h_ */ diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/htsearch.cc b/debian/htdig/htdig-3.2.0b6/htsearch/htsearch.cc new file mode 100644 index 00000000..8c410784 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/htsearch.cc @@ -0,0 +1,957 @@ +// +// htsearch.cc +// +// htsearch: The main search CGI. Parses the CGI input, reads the config files +// and calls the necessary code to put together the result lists +// and the final display. 
+// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: htsearch.cc,v 1.72 2004/05/28 13:15:24 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include "htsearch.h" +#include "WeightWord.h" +#include "parser.h" +#include "Display.h" +#include "../htfuzzy/Fuzzy.h" +#include "cgi.h" +#include "WordRecord.h" +#include "HtWordList.h" +#include "StringList.h" +#include "IntObject.h" +#include "HtURLCodec.h" +#include "HtURLRewriter.h" +#include "WordContext.h" +#include "HtRegex.h" +#include "Collection.h" + +#include <time.h> +#include <ctype.h> +#include <signal.h> + + +// If we have this, we probably want it. +#ifdef HAVE_GETOPT_H +#include <getopt.h> +#elif HAVE_GETOPT_LOCAL +#include <getopt_local.h> +#endif + +typedef void (*SIGNAL_HANDLER) (...); + +// ResultList *htsearch(const String&, List &, Parser *); +void htsearch(Collection *, List &, Parser *); + +void setupWords(char *, List &, int, Parser *, String &); +void createLogicalWords(List &, String &, String &); +void reportError(char *); +void convertToBoolean(List &words); +void doFuzzy(WeightWord *, List &, List &); +void addRequiredWords(List &, StringList &); +void usage(); + +int debug = 0; +int minimum_word_length = 3; +StringList boolean_keywords; + +StringList collectionList; // List of databases to search on + +// reconised word prefixes (for field-restricted search and per-word fuzzy +// algorithms) in *descending* alphabetical order. +// Don't use a dictionary structure, as setup time outweights saving. +struct {char *name; unsigned int flag; } colonPrefix [] = +{ + { "url", FLAG_URL }, + { "title", FLAG_TITLE }, + { "text", FLAG_PLAIN }, // FLAG_TEXT is 0, i.e. *no* flag... 
+ { "link", FLAG_LINK_TEXT }, + { "keyword", FLAG_KEYWORDS }, + { "hidden", FLAG_HIDDEN }, + { "heading", FLAG_HEADING }, + { "exact", FLAG_EXACT }, + { "descr", FLAG_DESCRIPTION }, +// { "cap", FLAG_CAPITAL }, + { "author", FLAG_AUTHOR }, + { "", 0 }, +}; + +//***************************************************************************** +// int main() +// +int +main(int ac, char **av) +{ + int c; + extern char *optarg; + int override_config=0; + // List searchWords; + List *searchWords = NULL; + String configFile = DEFAULT_CONFIG_FILE; + int pageNumber = 1; + HtRegex limit_to; + HtRegex exclude_these; + String logicalWords; + String origPattern; + String logicalPattern; + // StringMatch searchWordsPattern; + StringMatch *searchWordsPattern = NULL; + StringList requiredWords; + int i; + Dictionary selected_collections; // Multiple database support + + // + // Parse command line arguments + // + while ((c = getopt(ac, av, "c:dv")) != -1) + { + switch (c) + { + case 'c': + // The default is obviously to do this securely + // but if people want to shoot themselves in the foot... +#ifndef ALLOW_INSECURE_CGI_CONFIG + if (!getenv("REQUEST_METHOD")) + { +#endif + configFile = optarg; + override_config=1; +#ifndef ALLOW_INSECURE_CGI_CONFIG + } +#endif + break; + case 'v': + debug++; + break; + case 'd': + debug++; + break; + case '?': + usage(); + break; + } + } + + // + // The total search can NEVER take more than 5 minutes. + // +#ifndef _MSC_VER /* _WIN32 */ + alarm(5 * 60); +#endif + + // + // Parse the CGI parameters. + // + char none[] = ""; + cgi input(optind < ac ? av[optind] : none); + + // Multiple databases may be specified for search. + // Identify all databases specified with the "config=" parameter. 
+ if (input.exists("config")) + { + collectionList.Create(input["config"], " \t\001|"); + } + if (collectionList.Count() == 0) + collectionList.Add(""); // use default config + String errorMsg = ""; + String originalWords = input["words"]; + originalWords.chop(" \t\r\n"); + + HtConfiguration* config= HtConfiguration::config(); + + // Iterate over all specified collections (databases) + for (int cInd=0; errorMsg.empty() && cInd < collectionList.Count(); cInd++) + { + // Each collection is handled in an iteration. Reset the following so + // that we start with a clean slate. + // + logicalWords = 0; + origPattern = 0; + logicalPattern = 0; + searchWords = new List; + searchWordsPattern = new StringMatch; + + char *config_name = collectionList[cInd]; + if (config_name && config_name[0] == '\0') + config_name = NULL; // use default config + + // + // Setup the configuration database. First we read the compiled defaults. + // Then we override those with defaults read in from the configuration + // file, and finally we override some attributes with information we + // got from the HTML form. + // + config->Defaults(&defaults[0]); + // To allow . in filename while still being 'secure', + // e.g. htdig-f.q.d.n.conf + if (!override_config && config_name + && (strstr(config_name, "./") == NULL)) + { + char *configDir = getenv("CONFIG_DIR"); + if (configDir) + { + configFile = configDir; + } + else + { + configFile = CONFIG_DIR; + } + if (strlen(config_name) == 0) + configFile = DEFAULT_CONFIG_FILE; + else + configFile << '/' << config_name << ".conf"; + } + if (access((char*)configFile, R_OK) < 0) + { + reportError(form("Unable to read configuration file '%s'", + configFile.get())); + } + config->Read(configFile); + + // Initialize htword library (key description + wordtype...) 
+ WordContext::Initialize(*config); + + if (input.exists("method")) + config->Add("match_method", input["method"]); + if (input.exists("format")) + config->Add("template_name", input["format"]); + + if (input.exists("matchesperpage")) + { + // minimum check for a valid int value of "matchesperpage" cgi variable + if (atoi(input["matchesperpage"]) > 0) + config->Add("matches_per_page", input["matchesperpage"]); + } + + if (input.exists("page")) + pageNumber = atoi(input["page"]); + if (input.exists("config")) + config->Add("config", input["config"]); + if (input.exists("restrict")) + config->Add("restrict", input["restrict"]); + if (input.exists("exclude")) + config->Add("exclude", input["exclude"]); + if (input.exists("keywords")) + config->Add("keywords", input["keywords"]); + requiredWords.Create(config->Find("keywords"), " \t\r\n\001"); + if (input.exists("sort")) + config->Add("sort", input["sort"]); + + // Changes added 3-31-99, by Mike Grommet + // Check form entries for starting date, and ending date + // Each date consists of a month, day, and year + + if (input.exists("startmonth")) + config->Add("startmonth", input["startmonth"]); + if (input.exists("startday")) + config->Add("startday", input["startday"]); + if (input.exists("startyear")) + config->Add("startyear", input["startyear"]); + + if (input.exists("endmonth")) + config->Add("endmonth", input["endmonth"]); + if (input.exists("endday")) + config->Add("endday", input["endday"]); + if (input.exists("endyear")) + config->Add("endyear", input["endyear"]); + + // END OF CHANGES BY MIKE GROMMET + + + minimum_word_length = config->Value("minimum_word_length", minimum_word_length); + + StringList form_vars(config->Find("allow_in_form"), " \t\r\n"); + for (i= 0; i < form_vars.Count(); i++) + { + if (input.exists(form_vars[i])) + config->Add(form_vars[i], input[form_vars[i]]); + } + + // + // Compile the URL limit patterns. 
+ // + + if (config->Find("restrict").length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(config->Find("restrict"), " \t\r\n\001|"); + limit_to.setEscaped(l); + String u = l.Join('|'); + config->Add("restrict", u); // re-create the config attribute + } + if (config->Find("exclude").length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(config->Find("exclude"), " \t\r\n\001|"); + exclude_these.setEscaped(l); + String u = l.Join('|'); + config->Add("exclude", u); // re-create the config attribute + } + + // + // Check url_part_aliases and common_url_parts for + // errors. + String url_part_errors = HtURLCodec::instance()->ErrMsg(); + + if (url_part_errors.length() != 0) + reportError(form("Invalid url_part_aliases or common_url_parts: %s", + url_part_errors.get())); + + // for htsearch, use search_rewrite_rules attribute for HtURLRewriter. + config->AddParsed("url_rewrite_rules", "${search_rewrite_rules}"); + url_part_errors = HtURLRewriter::instance()->ErrMsg(); + if (url_part_errors.length() != 0) + reportError(form("Invalid url_rewrite_rules: %s", + url_part_errors.get())); + + // Load boolean_keywords from configuration + // they should be placed in this order: + // 0 1 2 + // and or not + boolean_keywords.Destroy(); + boolean_keywords.Create(config->Find("boolean_keywords"), + "| \t\r\n\001"); + if (boolean_keywords.Count() != 3) + reportError("boolean_keywords attribute should have three entries"); + + Parser *parser = new Parser(); + + // + // Parse the words to search for from the argument list. + // This will produce a list of WeightWord objects. + // + setupWords(originalWords, *searchWords, + strcmp(config->Find("match_method"), "boolean") == 0, + parser, origPattern); + + // + // Convert the list of WeightWord objects to a pattern string + // that we can compile. 
+ // + createLogicalWords(*searchWords, logicalWords, logicalPattern); + + // + // Assemble the full pattern for excerpt matching and highlighting + // + origPattern += logicalPattern; + searchWordsPattern->IgnoreCase(); + searchWordsPattern->IgnorePunct(); + searchWordsPattern->Pattern(logicalPattern); // this should now be enough + //searchWordsPattern.Pattern(origPattern); + //if (debug > 2) + // cout << "Excerpt pattern: " << origPattern << "\n"; + + // + // If required keywords were given in the search form, we will + // modify the current searchWords list to include the required + // words. + // + if (requiredWords.Count() > 0) + { + addRequiredWords(*searchWords, requiredWords); + } + + // + // Perform the actual search. The function htsearch() is used for this. + // The Dictionary it returns is then passed on to the Display object to + // actually render the results in HTML. + // + const String word_db = config->Find("word_db"); + if (access(word_db, R_OK) < 0) + { + reportError(form("Unable to read word database file '%s'\nDid you run htdig?", + word_db.get())); + } + // ResultList *results = htsearch((char*)word_db, searchWords, parser); + + String doc_index = config->Find("doc_index"); + if (access((char*)doc_index, R_OK) < 0) + { + reportError(form("Unable to read document index file '%s'\nDid you run htdig?", + doc_index.get())); + } + + const String doc_db = config->Find("doc_db"); + if (access(doc_db, R_OK) < 0) + { + reportError(form("Unable to read document database file '%s'\nDid you run htdig?", + doc_db.get())); + } + + const String doc_excerpt = config->Find("doc_excerpt"); + if (access(doc_excerpt, R_OK) < 0) + { + reportError(form("Unable to read document excerpts '%s'\nDid you run htdig?", + doc_excerpt.get())); + } + + // Multiple database support + Collection *collection = new Collection((char*)configFile, + word_db.get(), doc_index.get(), doc_db.get(), doc_excerpt.get()); + + // Perform search within the collection. 
Each collection stores its + // own result list. + htsearch(collection, *searchWords, parser); + collection->setSearchWords(searchWords); + collection->setSearchWordsPattern(searchWordsPattern); + selected_collections.Add(configFile, collection); + + if (parser->hadError()) + errorMsg = parser->getErrorMessage(); + + delete parser; + } + + // Display display(doc_db, 0, doc_excerpt); + Display display(&selected_collections); + if (display.hasTemplateError()) + { + reportError(form("Unable to read template file '%s'\nDoes it exist?", + (const char*)config->Find("template_name"))); + return 0; + } + display.setOriginalWords(originalWords); + // display.setResults(results); + // display.setSearchWords(&searchWords); + display.setLimit(&limit_to); + display.setExclude(&exclude_these); + // display.setAllWordsPattern(searchWordsPattern); + display.setCGI(&input); + display.setLogicalWords(logicalWords); + if (!errorMsg.empty()) + display.displaySyntaxError(errorMsg); + else + display.display(pageNumber); + + // delete results; + // delete parser; + return 0; +} + +//***************************************************************************** +void +createLogicalWords(List &searchWords, String &logicalWords, String &wm) +{ + String pattern; + int i; + int wasHidden = 0; + int inPhrase = 0; + + for (i = 0; i < searchWords.Count(); i++) + { + WeightWord *ww = (WeightWord *) searchWords[i]; + if (!ww->isHidden) + { + if (strcmp((char*)ww->word, "&") == 0 && wasHidden == 0) + logicalWords << ' ' << boolean_keywords[AND] << ' '; + else if (strcmp((char*)ww->word, "|") == 0 && wasHidden == 0) + logicalWords << ' ' << boolean_keywords[OR] << ' '; + else if (strcmp((char*)ww->word, "!") == 0 && wasHidden == 0) + logicalWords << ' ' << boolean_keywords[NOT] << ' '; + else if (strcmp((char*)ww->word, "\"") == 0 && wasHidden == 0) + { + if (inPhrase) + logicalWords.chop(' '); + inPhrase = !inPhrase; + logicalWords << "\""; + } + else if (wasHidden == 0) + { + logicalWords << 
ww->word; + if (inPhrase) + logicalWords << " "; + } + wasHidden = 0; + } + else + wasHidden = 1; + // generate patterns to search for and highlight in excerpt + if (ww->weight > 0 // Ignore boolean syntax stuff + && (!ww->isIgnore || inPhrase)) // Ignore bad/short words + { // but highlight them in phrases + char spacer = inPhrase ? ' ' : '|'; + if (wm.length()) + wm << spacer; + wm << ww->word; + if (!ww->isIgnore) // ignore bad/short words for searching + { + if (pattern.length()) + pattern << spacer; + pattern << ww->word; + } + } + } + + if (debug) + { + cerr << "LogicalWords: " << logicalWords << endl; + cerr << "Pattern: " << pattern << endl; + cerr << "Highlight Pattern: " << wm << endl; + } +} + +void +dumpWords(List &words, char *msg = "") +{ + if (debug) + { + cerr << msg << ": '"; + for (int i = 0; i < words.Count(); i++) + { + WeightWord *ww = (WeightWord *) words[i]; + cerr << ww->word << ':' << ww->isHidden << ' '; + } + cerr << "'\n"; + } +} + +//***************************************************************************** +// void setupWords(char *allWords, List &searchWords, +// int boolean, Parser *parser, String &originalPattern) +// +void +setupWords(char *allWords, List &searchWords, int boolean, Parser *parser, + String &originalPattern) +{ + HtConfiguration* config= HtConfiguration::config(); + List tempWords; + int i; + + // + // Parse the words we need to search for. It should be a list of words + // with optional 'and' and 'or' between them. The list of words + // will be put in the searchWords list and at the same time in the + // String pattern separated with '|'. + // + + // + // Convert the string to a list of WeightWord objects. The special + // characters '(' and ')' will be put into their own WeightWord objects. 
+ // + unsigned char *pos = (unsigned char*) allWords; + unsigned char t; + String word; + const String prefix_suffix = config->Find("prefix_match_character"); + + while (*pos) + { + while (1) + { + if (debug > 3) + cerr << "setupWords: " << pos << endl; + t = *pos++; + if (isspace(t)) + { + continue; + } + else if (t == '"') + { + tempWords.Add(new WeightWord("\"", -1.0)); + break; + } + else if (boolean && (t == '(' || t == ')')) + { + char s[2]; + s[0] = t; + s[1] = '\0'; + tempWords.Add(new WeightWord(s, -1.0)); + break; + } + else if (HtIsWordChar(t) || + (strchr(prefix_suffix, t) != NULL) || + (t >= 161 && t <= 255)) + { + unsigned int fieldFlag = 0; + word = 0; + do // while recognised prefix, followed by ':' + { + while (t && (HtIsWordChar(t) || + (strchr(prefix_suffix, t) != NULL) || + (t >= 161 && t <= 255))) + { + word << (char) t; + t = *pos++; + } + if (debug > 2) + cerr << "word: " << word << endl; + if (t == ':') // e.g. "author:word" to search + { // only in author + word.lowercase(); + t = *pos++; + if (t && (HtIsWordChar (t) || + (strchr(prefix_suffix, t) != NULL) || + (t >= 161 && t <= 255))) + { + int i, cmp; + const char *w = word.get(); + // linear search of known prefixes, with "" flag. + for (i = 0; (cmp = mystrcasecmp (w, colonPrefix[i].name)) < 0; i++) + ; + if (debug > 2) + cerr << "field: "<< colonPrefix[i].name << endl; + if (cmp == 0) // if prefix found... 
+ { + fieldFlag |= colonPrefix [i].flag; + word = 0; + } + } + } + } while (!word.length() && t); + pos--; + if (!t && !word.length()) // query ended with junk chars + break; + + if (boolean && (mystrcasecmp(word.get(), "+") == 0 + || mystrcasecmp(word.get(), boolean_keywords[AND]) == 0)) + { + tempWords.Add(new WeightWord("&", -1.0)); + } + else if (boolean && + mystrcasecmp(word.get(), boolean_keywords[OR]) == 0) + { + tempWords.Add(new WeightWord("|", -1.0)); + } + else if (boolean && (mystrcasecmp(word.get(), "-") == 0 + || mystrcasecmp(word.get(), boolean_keywords[NOT]) == 0)) + { + tempWords.Add(new WeightWord("!", -1.0)); + } + else + { + // Add word to excerpt matching list + originalPattern << word << "|"; + WeightWord *ww = new WeightWord(word, 1.0, fieldFlag); + if(HtWordNormalize(word) & WORD_NORMALIZE_NOTOK) + ww->isIgnore = 1; + tempWords.Add(ww); + } + break; + } + } + } + + dumpWords(tempWords, "tempWords"); + + // + // If the user specified boolean expression operators, the whole + // expression has to be syntactically correct. If not, we need + // to report a syntax error. + // + if (boolean) + { + if (!parser->checkSyntax(&tempWords)) + { + for (i = 0; i < tempWords.Count(); i++) + { + searchWords.Add(tempWords[i]); + } + tempWords.Release(); + return; +// reportError("Syntax error"); + } + } + else + { + convertToBoolean(tempWords); + } + + dumpWords(tempWords, "Boolean"); + + // + // We need to assign weights to the words according to the search_algorithm + // configuration attribute. + // For algorithms other than exact, we need to also do word lookups. + // + StringList algs(config->Find("search_algorithm"), " \t"); + List algorithms; + String name, weight; + double fweight; + Fuzzy *fuzzy = 0; + + // + // Generate the list of algorithms to use and associate the given + // weights with them. 
+ // + for (i = 0; i < algs.Count(); i++) + { + name = strtok(algs[i], ":"); + weight = strtok(0, ":"); + if (name.length() == 0) + name = "exact"; + if (weight.length() == 0) + weight = "1"; + fweight = atof((char*)weight); + + fuzzy = Fuzzy::getFuzzyByName(name, *config); + if (fuzzy) + { + if (debug > 1) + cerr << "Adding algorithm " << name.get() << endl; + fuzzy->setWeight(fweight); + fuzzy->openIndex(); + algorithms.Add(fuzzy); + } else if (debug) + cerr << "Unknown fuzzy search algorithm " << name.get() << endl; + } + + dumpWords(searchWords, "initial"); + + // + // For each of the words, apply all the algorithms. + // + int in_phrase = 0; // If we get into a phrase, we don't want to fuzz. + for (i = 0; i < tempWords.Count(); i++) + { + WeightWord *ww = (WeightWord *) tempWords[i]; + if (ww->weight > 0 && !ww->isIgnore && !in_phrase) +// I think that should be: +// if (ww->weight > 0 && !ww->isIgnore && !in_phrase && !ww->isExact) + { + // + // Apply all the algorithms to the word. + // + if (debug) + cerr << "Fuzzy on: " << ww->word << endl; + doFuzzy(ww, searchWords, algorithms); + delete ww; + } + else if (ww->word.length() == 1 && ww->word[0] == '"') + { + in_phrase = !in_phrase; + if (debug) + cerr << "Add: " << ww->word << endl; + searchWords.Add(ww); + } + else + { + // + // This is '(', ')', '&', or '|'. These will be automatically + // transfered to the searchWords list. 
+ // + if (debug) + cerr << "Add: " << ww->word << endl; + searchWords.Add(ww); + } + dumpWords(searchWords, "searchWords"); + } + tempWords.Release(); +} + + +//***************************************************************************** +void +doFuzzy(WeightWord *ww, List &searchWords, List &algorithms) +{ + List fuzzyWords; + List weightWords; + Fuzzy *fuzzy = 0; + WeightWord *newWw = 0; + String *word = 0; + + algorithms.Start_Get(); + while ((fuzzy = (Fuzzy *) algorithms.Get_Next())) + { + if (debug > 1) + cerr << " " << fuzzy->getName(); + fuzzy->getWords(ww->word, fuzzyWords); + fuzzyWords.Start_Get(); + while ((word = (String *) fuzzyWords.Get_Next())) + { + if (debug > 1) + cerr << " " << word->get(); + // (should be a "copy with changed weight" constructor...) + newWw = new WeightWord(word->get(), fuzzy->getWeight()); + newWw->isExact = ww->isExact; + newWw->isHidden = ww->isHidden; + newWw->flags = ww->flags; + weightWords.Add(newWw); + } + if (debug > 1) + cerr << endl; + fuzzyWords.Destroy(); + } + + // + // We now have a list of substitute words. They need to be added + // to the searchWords. 
+ // + if (weightWords.Count()) + { + if (weightWords.Count() > 1) + searchWords.Add(new WeightWord("(", -1.0)); + for (int i = 0; i < weightWords.Count(); i++) + { + if (i > 0) + searchWords.Add(new WeightWord("|", -1.0)); + searchWords.Add(weightWords[i]); + } + if (weightWords.Count() > 1) + searchWords.Add(new WeightWord(")", -1.0)); + } + else // if no fuzzy matches, add exact word, but give it tiny weight + { + searchWords.Add(new WeightWord(ww->word.get(), 0.000001)); + } + weightWords.Release(); +} + + +//***************************************************************************** +// void convertToBoolean(List &words) +// +void +convertToBoolean(List &words) +{ + HtConfiguration* config= HtConfiguration::config(); + List list; + int i; + int do_and = strcmp(config->Find("match_method"), "and") == 0; + int in_phrase = 0; + + String quote = "\""; + + if (words.Count() == 0) + return; + list.Add(words[0]); + + // We might start off with a phrase match + if (((WeightWord *) words[0])->word == quote) + in_phrase = 1; + + for (i = 1; i < words.Count(); i++) + { + if (do_and && !in_phrase) + list.Add(new WeightWord("&", -1.0)); + else if (!in_phrase) + list.Add(new WeightWord("|", -1.0)); + + if (((WeightWord *) words[i])->word == quote) + in_phrase = !in_phrase; + + list.Add(words[i]); + } + words.Release(); + + for (i = 0; i < list.Count(); i++) + { + words.Add(list[i]); + } + list.Release(); +} + + +//***************************************************************************** +// Dictionary *htsearch(char *wordfile, List &searchWords, Parser *parser) +// This returns a dictionary indexed by document ID and containing a +// List of HtWordReference objects. 
+// +void +htsearch(Collection *collection, List &searchWords, Parser *parser) +{ + // + // Pick the database type we are going to use + // + ResultList *matches = new ResultList; + if (searchWords.Count() > 0) + { + // parser->setDatabase(wordfile); + parser->setCollection(collection); + parser->parse(&searchWords, *matches); + } + + collection->setResultList(matches); + // return matches; +} + + +//***************************************************************************** +// Modify the search words list to include the required words as well. +// This is done by putting the existing search words in parenthesis and +// appending the required words separated with "and". +void +addRequiredWords(List &searchWords, StringList &requiredWords) +{ + HtConfiguration* config= HtConfiguration::config(); + static int any_keywords = config->Boolean("any_keywords", 0); + if (requiredWords.Count() == 0) + return; + if (searchWords.Count() > 0) + { + searchWords.Insert(new WeightWord("(", -1.0), 0); + searchWords.Add(new WeightWord(")", -1.0)); + searchWords.Add(new WeightWord("&", -1.0)); + } + if (requiredWords.Count() == 1) + { + searchWords.Add(new WeightWord(requiredWords[0], 1.0)); + } + else + { + searchWords.Add(new WeightWord("(", -1.0)); + searchWords.Add(new WeightWord(requiredWords[0], 1.0)); + for (int i = 1; i < requiredWords.Count(); i++) + { + if (any_keywords) + searchWords.Add(new WeightWord("|", -1.0)); + else + searchWords.Add(new WeightWord("&", -1.0)); + searchWords.Add(new WeightWord(requiredWords[i], 1.0)); + } + searchWords.Add(new WeightWord(")", -1.0)); + } +} + + +//***************************************************************************** +// Report an error. Since we don' know if we are running as a CGI or not, +// we will assume this is the first thing returned by a CGI program. 
//
// Emit an HTML-formatted error page and terminate.  We cannot tell whether
// we are running as a CGI, so the CGI Content-type header is sent first to
// be safe.  Never returns (exits with status 1).
void
reportError(char *msg)
{
    HtConfiguration* config= HtConfiguration::config();
    cout << "Content-type: text/html\r\n\r\n";
    cout << "<html><head><title>htsearch error</title></head>\n";
    cout << "<body bgcolor=\"#ffffff\">\n";
    cout << "<h1>ht://Dig error</h1>\n";
    cout << "<p>htsearch detected an error. Please report this to the\n";
    cout << "webmaster of this site by sending an e-mail to:\n";
    cout << "<a href=\"mailto:" << config->Find("maintainer") << "\">";
    cout << config->Find("maintainer") << "</a>\n";
    cout << "The error message is:</p>\n";
    cout << "<pre>\n" << msg << "\n</pre>\n</body></html>\n";
    exit(1);
}

//*****************************************************************************
// void usage()
// Display program usage information--assumes we're running from a cmd line
//
void usage()
{
    cout << "usage: htsearch [-v][-d][-c configfile] [query_string]\n";
    cout << "This program is part of ht://Dig " << VERSION << "\n\n";
    cout << "Options:\n";
    cout << "\t-v -d\tVerbose mode. This increases the verbosity of the\n";
    cout << "\t\tprogram. Using more than 2 is probably only useful\n";
    cout << "\t\tfor debugging purposes. The default verbose mode\n";
    cout << "\t\tgives a progress on what it is doing and where it is.\n\n";
    cout << "\t-c configfile\n";
    cout << "\t\tUse the specified configuration file instead on the\n";
    cout << "\t\tdefault.\n\n";
    cout << "\tquery_string\tA CGI-style query string can be given as a single\n";
    cout << "\t\targument, and is only used if the REQUEST_METHOD environment\n";
    cout << "\t\tvariable is not set. If no query_string is given, and\n";
    cout << "\t\tREQUEST_METHOD is not set, htsearch will prompt for the query.\n\n";
    exit(0);
}
diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/htsearch.h b/debian/htdig/htdig-3.2.0b6/htsearch/htsearch.h
new file mode 100644
index 00000000..59133e38
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htsearch/htsearch.h
@@ -0,0 +1,71 @@
//
// htsearch.h
//
// htsearch: The main search CGI. Parses the CGI input, reads the config files
//           and calls the necessary code to put together the result lists
//           and the final display.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: htsearch.h,v 1.16 2004/05/28 13:15:25 lha Exp $
//

#ifndef _htsearch_h_
#define _htsearch_h_

#include "List.h"
#include "StringList.h"
#include "Dictionary.h"
#include "DocumentRef.h"
#include "Database.h"
#include "good_strtok.h"
#include "DocumentDB.h"
#include "htString.h"
#include "HtConfiguration.h"
#include "ResultMatch.h"
#include "ResultList.h"
#include "HtWordReference.h"
#include "StringMatch.h"
#include "defaults.h"

#include <stdio.h>
#include <stdlib.h>

#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */

#ifndef _MSC_VER /* _WIN32 */
#include <unistd.h>
#endif

// Globals shared between htsearch.cc and the display/parsing modules.
extern int n_matches;
extern int do_and;
extern int do_short;
extern StringList fields;

#ifndef _MSC_VER /* _WIN32 */
extern StringMatch limit_to;
#endif

extern StringMatch URLimage;
extern List URLimageList;
extern StringMatch wm;
extern Database *dbf;
extern String logicalWords;
extern String originalWords;
extern int debug;
extern StringList collectionList;

#endif


diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc b/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc
new file mode 100644
index 00000000..3ed1531c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htsearch/parser.cc
@@ -0,0 +1,918 @@
//
// parser.cc
//
// parser: Parses a boolean expression tree, retrieving and scoring
//         the resulting document list
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: parser.cc,v 1.36 2004/06/11 16:50:33 grdetil Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "parser.h"
#include "HtPack.h"
#include "Collection.h"
#include "Dictionary.h"
#include "QuotedStringList.h"

// Token codes returned by lexan() in addition to the literal
// single-character operator tokens '&' '|' '!' '(' ')' '"'.
#define WORD 1000
#define DONE 1001

// Configurable error-message fragments (see checkSyntax for the order).
QuotedStringList boolean_syntax_errors;
enum ErrorIndices { EXPECTED, SEARCH_WORD, AT_END, INSTEAD_OF, END_OF_EXPR, QUOTE };

//*****************************************************************************
// Construct a parser bound to the word database named by the current
// configuration; all state starts out empty/valid.
Parser::Parser() :
    words(*(HtConfiguration::config()))
{
    tokens = 0;
    result = 0;
    current = 0;
    valid = 1;
}


//*****************************************************************************
// int Parser::checkSyntax(List *tokenList)
//   As the name of the function implies, we will only perform a syntax check
//   on the list of tokens.
+// +int +Parser::checkSyntax(List *tokenList) +{ + HtConfiguration* config= HtConfiguration::config(); + void reportError(char *); + // Load boolean_syntax_errors from configuration + // they should be placed in this order: + // 0 1 2 3 4 + // Expected "a search word" "at the end" "instead of" "end of expression" + // 5 + // "a closing quote" + boolean_syntax_errors.Destroy(); + boolean_syntax_errors.Create(config->Find("boolean_syntax_errors"), "| \t\r\n\001"); + if (boolean_syntax_errors.Count() == 5) + { // for backward compatibility + boolean_syntax_errors.Add (new String ("a closing quote")); + if (debug) + cerr << "Parser::checkSyntax() : boolean_syntax_errors should have six entries\n"; + } else if (boolean_syntax_errors.Count() != 6) + reportError("boolean_syntax_errors attribute should have six entries"); + tokens = tokenList; + valid = 1; + fullexpr(0); + return valid; +} + +//***************************************************************************** +/* Called by: Parser::parse(List*, ResultList&), checkSyntax(List*) */ +/* Inputs: output -- if zero, simply check syntax */ +/* otherwise, list matching documents in head of "stack" */ +void +Parser::fullexpr(int output) +{ + tokens->Start_Get(); + lookahead = lexan(); + expr(output); + if (valid && lookahead != DONE) + { + setError(boolean_syntax_errors[END_OF_EXPR]); + } +} + +//***************************************************************************** +int +Parser::lexan() +{ + current = (WeightWord *) tokens->Get_Next(); + if (!current) + return DONE; + else if (mystrcasecmp((char*)current->word, "&") == 0) + return '&'; + else if (mystrcasecmp((char*)current->word, "|") == 0) + return '|'; + else if (mystrcasecmp((char*)current->word, "!") == 0) + return '!'; + else if (mystrcasecmp((char*)current->word, "(") == 0) + return '('; + else if (mystrcasecmp((char*)current->word, ")") == 0) + return ')'; + else if (mystrcasecmp((char*)current->word, "\"") == 0) + return '"'; + else + return WORD; +} + 
//*****************************************************************************
// Attempt to deal with expressions in the form
//	term | term | term ...
/* Called by: Parser::fullexpr(int), factor(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::expr(int output)
{
    term(output);
    while (1)
    {
	if (match('|'))
	{
	    term(output);
	    if (output)
	    {
		if(debug) cerr << "or--" << endl;
		perform_or();
		if(debug) cerr << "stack:" << stack.Size() << endl;
	    }
	}
	else
	    break;
    }
    // Two adjacent words with no operator between them is a syntax error;
    // the message suggests the valid connectives.
    if (valid && lookahead == WORD)
    {
	String expected = "'";
	expected << boolean_keywords[AND] << "' "<< boolean_keywords[OR] <<" '"
	    << boolean_keywords[OR] << "'";
	setError(expected.get());
    }
}

//*****************************************************************************
// Attempt to deal with terms in the form
//	factor & factor & factor ...
/* Called by: Parser::expr(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::term(int output)
{

    factor(output);
    if(debug) cerr << "term:factor" << endl;
    while (1)
    {
	if(match('&'))
	{
	    factor(output);
	    if(output)
	    {
		if(debug) cerr << "and--" << endl;
		perform_and();
		if(debug) cerr << "stack:" << stack.Size() << endl;
	    }
	}
	else if(match('!'))
	{
	    factor(output);
	    if(output)
	    {
		if(debug) cerr << "not--" << endl;
		perform_not();
		if(debug) cerr << "stack:" << stack.Size() << endl;
	    }
	}
	else
	{
	    break;
	}
    }
}

//*****************************************************************************
/* Gather and score a (possibly bracketed) boolean expression */
/* Called by: Parser::term(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::factor(int output)
{
    if(match('"'))
    {
	phrase(output);
    }
    else if (match('('))
    {
	expr(output);
	if (match(')'))
	{
	    return;
	}
	else
	{
	    setError("')'");
	}
    }
    else if (lookahead == WORD)
    {
	// A plain search word: look it up and push its scored result list.
	if (output)
	{
	    perform_push();
	}
	lookahead = lexan();
    }
    else
    {
	setError(boolean_syntax_errors[SEARCH_WORD]);
//	setError("a search word, a quoted phrase, a boolean expression between ()");
    }
}

//*****************************************************************************
/* Gather and score a quoted phrase */
/* Called by: Parser::factor(int) */
/* Inputs: output -- if zero, simply check syntax */
void
Parser::phrase(int output)
{
    List *wordList = 0;
    double weight = 1.0;

    while (1)
    {
	if (match('"'))
	{
	    // Closing quote: score the accumulated phrase positions.
	    if (output)
	    {
		if(!wordList) wordList = new List;
		if(debug) cerr << "scoring phrase" << endl;
		score(wordList, weight, FLAGS_MATCH_ONE); // look in all fields
	    }
	    break;
	}
	else if (lookahead == WORD)
	{
	    weight *= current->weight;
	    if (output)
		perform_phrase(wordList);

	    lookahead = lexan();
	}
	else if (lookahead == DONE)
	{
	    // Hit end of input before the closing quote.
	    setError(boolean_syntax_errors[QUOTE]);
	    break;
	}
	else
	{
	    // skip '&' '|' and '!' in the phrase
	    current->isIgnore = 1;
	    if (output)
		perform_phrase(wordList);
	    lookahead = lexan ();
	}
    } // end while
    // NOTE(review): List's destructor presumably releases the contained
    // HtWordReferences here -- confirm this does not double-free entries
    // already consumed by score().
    if(wordList) delete wordList;
}

//*****************************************************************************
// Consume the current token if it matches t; returns 1 on a match, 0 (and
// leaves the token pending) otherwise.
int
Parser::match(int t)
{
    if (lookahead == t)
    {
	lookahead = lexan();
	return 1;
    }
    else
	return 0;
}

//*****************************************************************************
// Record the first syntax error encountered, built from the configurable
// boolean_syntax_errors fragments; subsequent errors are ignored.
void
Parser::setError(char *expected)
{
    if (valid)
    {
	valid = 0;
	error = 0;
	error << boolean_syntax_errors[EXPECTED] << ' ' << expected;
	if (lookahead == DONE || !current)
	{
	    error << ' ' << boolean_syntax_errors[AT_END];
	}
	else
	{
	    error << ' ' << boolean_syntax_errors[INSTEAD_OF] << " '"
		<< current->word.get() << "'";
	    // Name the operator keyword for the offending token.
	    switch (lookahead)
	    {
	    case '&': error << ' ' << boolean_keywords[OR] << " '"
			<< boolean_keywords[AND] << "'";
		break;
	    case '|': error << ' ' << boolean_keywords[OR] << " '"
			<< boolean_keywords[OR] << "'";
		break;
	    case '!': error << ' ' << boolean_keywords[OR] << " '"
			<< boolean_keywords[NOT] << "'";
		break;
	    }
	}
	if (debug) cerr << "Syntax error: " << error << endl;
    }
}

//*****************************************************************************
// Perform a lookup of the current word and push the result onto the stack
//
void
Parser::perform_push()
{
    HtConfiguration* config= HtConfiguration::config();
    static int maximum_word_length = config->Value("maximum_word_length", 12);
    String temp = current->word.get();
    char *p;

    if(debug)
	cerr << "perform_push @"<< stack.Size() << ": " << temp << endl;

    String wildcard = config->Find("prefix_match_character");
    if (!wildcard.get())
	wildcard = "*";
    if (temp == wildcard)
    {
	// The bare wildcard matches every document in the database.
	if (debug) cerr << "Wild card search\n";
	ResultList *list = new ResultList;
	String doc_db = config->Find("doc_db");
	DocumentDB docdb;
	docdb.Read(doc_db);
	List *docs = docdb.DocIDs();

	//
	// Traverse all the known documents
	//
	DocumentRef *ref;
	IntObject *id;
	DocMatch *dm;
	docs->Start_Get();
	while ((id = (IntObject *) docs->Get_Next()))
	{
	    ref = docdb[id->Value()];
	    if (debug)
		cerr << (ref ? "Wildcard match" : "Wildcard empty") << endl;
	    if (ref)
	    {
		dm = new DocMatch;
		dm->score = current->weight;
		dm->id = ref->DocID();
		dm->orMatches = 1;
		dm->anchor = 0;
		list->add(dm);
	    }
	    delete ref;
	}
	delete docs;
	stack.push(list);

	return;
    }

    // Must be after wildcard: "*" is "isIgnore" because it is too short.
    if (current->isIgnore)
    {
	if(debug) cerr << "ignore: " << temp << " @" << stack.Size() << endl;
	//
	// This word needs to be ignored. Make it so.
	//
	ResultList *list = new ResultList;
	list->isIgnore = 1;
	stack.push(list);
	return;
    }

    // Truncate the word to the indexed maximum before lookup.
    temp.lowercase();
    p = temp.get();
    if (temp.length() > maximum_word_length)
	p[maximum_word_length] = '\0';

    List* result = words[p];
    score(result, current->weight, current->flags);
    delete result;
}

//*****************************************************************************
// BUG: Phrases containing "bad words" can have *any* "bad word" in that
//      position. Words less than minimum_word_length ignored entirely,
//      as they are not indexed.
void
Parser::perform_phrase(List * &oldWords)
{
    HtConfiguration* config= HtConfiguration::config();
    static int maximum_word_length = config->Value("maximum_word_length", 12);
    String temp = current->word.get();
    char *p;
    List *newWords = 0;
    HtWordReference *oldWord, *newWord;

    // how many words ignored since last checked word?
    static int ignoredWords = 0;

    // if the query is empty, no further effort is needed
    if(oldWords && oldWords->Count() == 0)
    {
	if(debug) cerr << "phrase not found, skip" << endl;
	return;
    }

    if(debug) cerr << "phrase current: " << temp << endl;
    if (current->isIgnore)
    {
	//
	// This word needs to be ignored. Make it so.
	//
	// Only count it toward the position gap if it is long enough to
	// have been indexed at all.
	if (temp.length() >= config->Value ("minimum_word_length") && oldWords)
	    ignoredWords++;
	if(debug) cerr << "ignoring: " << temp << endl;
	return;
    }

    temp.lowercase();
    p = temp.get();
    if (temp.length() > maximum_word_length)
	p[maximum_word_length] = '\0';

    // NOTE(review): newWords is dereferenced without a null check below;
    // presumably words[] always returns a (possibly empty) List -- confirm.
    newWords = words[p];
    if(debug) cerr << "new words count: " << newWords->Count() << endl;

    // If we don't have a prior list of words, we want this one...
    if (!oldWords)
    {
	oldWords = new List;
	if(debug) cerr << "phrase adding first: " << temp << endl;
	newWords->Start_Get();
	while ((newWord = (HtWordReference *) newWords->Get_Next()))
	{
	    oldWords->Add(newWord);
	}
	if(debug) cerr << "old words count: " << oldWords->Count() << endl;
	return;
    }

    // OK, now we have a previous list in wordList and a new list
    List *results = new List;

    // Index the new occurrences by "docID-location" for O(1) adjacency
    // lookups against the previous word's occurrences.
    Dictionary newDict(5000);

    String nid;
    newWords->Start_Get();
    while ((newWord = (HtWordReference *) newWords->Get_Next()))
    {
	nid = "";
	int did = newWord->DocID();
	nid << did;
	nid << "-";
	int loc = newWord->Location();
	nid << loc;
	if (! newDict.Exists(nid)) {
	    newDict.Add(nid, (Object *)newWord);
	} else {
//	    cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n";
//	    Double addition is a problem if you don't want your original objects deleted
	}
    }

    // Keep only the previous occurrences that are immediately followed
    // (allowing for ignored words) by an occurrence of the new word.
    String oid;
    oldWords->Start_Get();
    while ((oldWord = (HtWordReference *) oldWords->Get_Next()))
    {
	oid = "";
	int did = oldWord->DocID();
	oid << did;
	oid << "-";
	int loc = oldWord->Location();
	oid << loc + ignoredWords+1;
	if (newDict.Exists(oid))
	{
	    newWord = (HtWordReference *)newDict.Find(oid);

	    HtWordReference *result = new HtWordReference(*oldWord);

	    result->Flags(oldWord->Flags() & newWord->Flags());
	    result->Location(newWord->Location());

	    results->Add(result);
	}
    }
    ignoredWords = 0; // most recent word is not a non-ignored word

    newDict.Release();

    if(debug) cerr << "old words count: " << oldWords->Count() << endl;
    if(debug) cerr << "results count: " << results->Count() << endl;
    // Replace the caller's list contents with the surviving matches.
    oldWords->Destroy();
    results->Start_Get();
    while ((newWord = (HtWordReference *) results->Get_Next()))
    {
	oldWords->Add(newWord);
    }
    if(debug) cerr << "old words count: " << oldWords->Count() << endl;
    results->Release();
    delete results;

    newWords->Destroy();
    delete newWords;

}

+//***************************************************************************** +// Allocate scores based on words in wordList. +// Fields within which the word must appear are specified in flags +// (see HtWordReference.h). +void +Parser::score(List *wordList, double weight, unsigned int flags) +{ + HtConfiguration* config= HtConfiguration::config(); + DocMatch *dm; + HtWordReference *wr; + static double text_factor = config->Double("text_factor", 1); + static double caps_factor = config->Double("caps_factor", 1); + static double title_factor = config->Double("title_factor", 1); + static double heading_factor = config->Double("heading_factor", 1); + static double keywords_factor = config->Double("keywords_factor", 1); + static double meta_description_factor = config->Double("meta_description_factor", 1); + static double author_factor = config->Double("author_factor", 1); + static double description_factor = config->Double("description_factor", 1); + double wscore; + int docanchor; + int word_count; + + if (!wordList || wordList->Count() == 0) + { + // We can't score an empty list, so push a null pointer... + if(debug) cerr << "score: empty list, push 0 @" << stack.Size() << endl; + + stack.push(0); + return; + } + + ResultList *list = new ResultList; + if(debug) cerr << "score: push @" << stack.Size() << endl; + stack.push(list); + // We're now guaranteed to have a non-empty list + // We'll use the number of occurences of this word for scoring + word_count = wordList->Count(); + + wordList->Start_Get(); + while ((wr = (HtWordReference *) wordList->Get_Next())) + { + // + // ******* Compute the score for the document + // + + // If word not in one of the required fields, skip the entry. + // Plain text sets no flag in dbase, so treat it separately. 
+ if (!(wr->Flags() & flags) && (wr->Flags() || !(flags & FLAG_PLAIN))) + { + if (debug > 2) + cerr << "Flags " << wr->Flags() << " lack " << flags << endl; + continue; + } + + wscore = 0.0; + if (wr->Flags() == FLAG_TEXT) wscore += text_factor; + if (wr->Flags() & FLAG_CAPITAL) wscore += caps_factor; + if (wr->Flags() & FLAG_TITLE) wscore += title_factor; + if (wr->Flags() & FLAG_HEADING) wscore += heading_factor; + if (wr->Flags() & FLAG_KEYWORDS) wscore += keywords_factor; + if (wr->Flags() & FLAG_DESCRIPTION) wscore += meta_description_factor; + if (wr->Flags() & FLAG_AUTHOR) wscore += author_factor; + if (wr->Flags() & FLAG_LINK_TEXT) wscore += description_factor; + wscore *= weight; + wscore = wscore / (double)word_count; + docanchor = wr->Anchor(); + dm = list->find(wr->DocID()); + if (dm) + { + wscore += dm->score; + if (dm->anchor < docanchor) + docanchor = dm->anchor; + // We wish to *update* this, not add a duplicate + list->remove(wr->DocID()); + } + + dm = new DocMatch; + dm->id = wr->DocID(); + dm->score = wscore; + dm->orMatches = 1; // how many "OR" terms this doc has + dm->anchor = docanchor; + list->add(dm); + } +} + + +//***************************************************************************** +// The top two entries in the stack need to be ANDed together. +// +// a b a and b +// 0 0 0 +// 0 1 0 +// 0 x 0 +// 1 0 0 +// 1 1 intersect(a,b) +// 1 x a +// x 0 0 +// x 1 b +// x x x +// +void +Parser::perform_and() +{ + ResultList *l1 = (ResultList *) stack.pop(); + ResultList *l2 = (ResultList *) stack.pop(); + int i; + DocMatch *dm, *dm2, *dm3; + HtVector *elements; + + if(!(l2 && l1)) + { + if(debug) cerr << "and: at least one empty operator, pushing 0 @" << stack.Size() << endl; + stack.push(0); + if(l1) delete l1; + if(l2) delete l2; + return; + } + + // + // If either of the arguments is set to be ignored, we will use the + // other as the result. 
+ // remember l2 and l1, l2 not l1 + + if (l1->isIgnore && l2->isIgnore) + { + if(debug) cerr << "and: ignoring all, pushing ignored list @" << stack.Size() << endl; + ResultList *result = new ResultList; + result->isIgnore = 1; + delete l1; delete l2; + stack.push(result); + return; + } + else if (l1->isIgnore) + { + if(debug) cerr << "and: ignoring l1, pushing l2 @" << stack.Size() << endl; + stack.push(l2); + delete l1; + return; + } + else if (l2->isIgnore) + { + if(debug) cerr << "and: ignoring l2, pushing l2 @" << stack.Size() << endl; + stack.push(l1); + delete l2; + return; + } + + ResultList *result = new ResultList; + stack.push(result); + elements = l2->elements(); + + if(debug) + cerr << "perform and: " << elements->Count() << " " << l1->elements()->Count() << " "; + + for (i = 0; i < elements->Count(); i++) + { + dm = (DocMatch *) (*elements)[i]; + dm2 = l1->find(dm->id); + if (dm2) + { + // + // Duplicate document. Add scores and average "OR-matches" count + // + dm3 = new DocMatch; +// "if (dm2)" means "?:" operator not needed... +// dm3->score = dm->score + (dm2 ? dm2->score : 0); +// dm3->orMatches = (dm->orMatches + (dm2 ? 
dm2->orMatches : 0))/2; + dm3->score = dm->score + dm2->score; + dm3->orMatches = (dm->orMatches + dm2->orMatches)/2; + dm3->id = dm->id; + dm3->anchor = dm->anchor; +// if (dm2 && dm2->anchor < dm3->anchor) + if (dm2->anchor < dm3->anchor) + dm3->anchor = dm2->anchor; + result->add(dm3); + } + } + if(debug) + cerr << result->elements()->Count() << endl; + + elements->Release(); + delete elements; + delete l1; + delete l2; +} + +// a b a not b +// 0 0 0 +// 0 1 0 +// 0 x 0 +// 1 0 a +// 1 1 intersect(a,not b) +// 1 x a +// x 0 x +// x 1 x +// x x x +void +Parser::perform_not() +{ + ResultList *l1 = (ResultList *) stack.pop(); + ResultList *l2 = (ResultList *) stack.pop(); + int i; + DocMatch *dm, *dm2, *dm3; + HtVector *elements; + + + if(!l2) + { + if(debug) cerr << "not: no positive term, pushing 0 @" << stack.Size() << endl; + // Should probably be interpreted as "* not l1" + stack.push(0); + if(l1) delete l1; + return; + } + if(!l1 || l1->isIgnore || l2->isIgnore) + { + if(debug) cerr << "not: no negative term, pushing positive @" << stack.Size() << endl; + stack.push(l2); + if(l1) delete l1; + return; + } + + ResultList *result = new ResultList; + if(debug) cerr << "not: pushing result @" << stack.Size() << endl; + stack.push(result); + elements = l2->elements(); + + if(debug) + cerr << "perform not: " << elements->Count() << " " << l1->elements()->Count() << " "; + + for (i = 0; i < elements->Count(); i++) + { + dm = (DocMatch *) (*elements)[i]; + dm2 = l1->find(dm->id); + if (!dm2) + { + // + // Duplicate document. + // + dm3 = new DocMatch; + dm3->score = dm->score; + dm3->orMatches = dm->orMatches; + dm3->id = dm->id; + dm3->anchor = dm->anchor; + result->add(dm3); + } + } + if(debug) + cerr << result->elements()->Count() << endl; + + elements->Release(); + delete elements; + delete l1; + delete l2; +} + +//***************************************************************************** +// The top two entries in the stack need to be ORed together. 
//
void
Parser::perform_or()
{
    ResultList *l1 = (ResultList *) stack.pop();
    // Peek (not pop) the second operand: the union is accumulated in place.
    ResultList *result = (ResultList *) stack.peek();
    int i;
    DocMatch *dm, *dm2;
    HtVector *elements;

    //
    // If either of the arguments is not present, we will use the other as
    // the results.
    //
    if (!l1 && result)
    {
	if(debug) cerr << "or: no 2nd operand" << endl;
	return;	// result in top of stack
    }
    else if (l1 && !result)
    {
	if(debug) cerr << "or: no 1st operand" << endl;
	stack.pop();	// discard the null entry
	stack.push(l1);
	return;
    }
    else if (!l1 && !result)
    {
	if(debug) cerr << "or: no operands" << endl;
	stack.pop();
	stack.push(0);	// empty result
	return;
    }

    //
    // If either of the arguments is set to be ignored, we will use the
    // other as the result.
    //
    if (l1->isIgnore)
    {
	delete l1;
	return;
    }
    else if (result->isIgnore)
    {
	result = (ResultList *) stack.pop();
	stack.push(l1);
	delete result;
	return;
    }

    elements = l1->elements();
    if(debug)
	cerr << "perform or: " << elements->Count() << " " << result->elements()->Count() << " ";
    for (i = 0; i < elements->Count(); i++)
    {
	dm = (DocMatch *) (*elements)[i];
	dm2 = result->find(dm->id);
	if (dm2)
	{
	    //
	    // Update document. Add scores and add "OR-match" counts
	    //
	    dm2->score += dm->score;
	    dm2->orMatches += dm->orMatches;
	    if (dm->anchor < dm2->anchor)
		dm2->anchor = dm->anchor;
	}
	else
	{
	    dm2 = new DocMatch;
	    dm2->score = dm->score;
	    dm2->orMatches = dm->orMatches;
	    dm2->id = dm->id;
	    dm2->anchor = dm->anchor;
	    result->add(dm2);
	}
    }
    if(debug)
	cerr << result->elements()->Count() << endl;
    elements->Release();
    delete elements;
    delete l1;
}

//*****************************************************************************
// void Parser::parse(List *tokenList, ResultList &resultMatches)
//   Evaluate the token list and copy the surviving matches (documents in a
//   normal state) into resultMatches, applying the multimatch bonus.
void
Parser::parse(List *tokenList, ResultList &resultMatches)
{
    HtConfiguration* config= HtConfiguration::config();
    tokens = tokenList;
    DocumentRef *ref = NULL;

    fullexpr(1);

    ResultList *result = (ResultList *) stack.pop();
    if (!result)	// Ouch!
    {
//	It seems we now end up here on a syntax error, so don't clear anything!
//	valid = 0;
//	error = 0;
//	error << "Expected to have something to parse!";
	return;
    }
    HtVector *elements = result->elements();
    DocMatch *dm;

    // multimatch_factor gives extra weight to matching documents which
    // contain more than one "OR" term. This is applied after the whole
    // document is parsed, so multiple matches don't give exponentially
    // increasing weights
    double multimatch_factor = config->Double("multimatch_factor");

    for (int i = 0; i < elements->Count(); i++)
    {
	dm = (DocMatch *) (*elements)[i];
	// NOTE(review): getDocumentRef() appears to hand back an object that
	// is never deleted here -- confirm ownership; this may leak.
	ref = collection->getDocumentRef(dm->GetId());
	if(ref && ref->DocState() == Reference_normal)
	{
	    dm->collection = collection; // back reference
	    if (dm->orMatches > 1)
		dm->score *= 1+multimatch_factor;
	    resultMatches.add(dm);
	}
    }
    elements->Release();
    result->Release();
    delete elements;
    delete result;
}

// Bind the parser to a collection and open its word database read-only.
void
Parser::setCollection(Collection *coll)
{
    if (coll)
	words.Open(coll->getWordFile(), O_RDONLY);
    collection = coll;
}

diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/parser.h b/debian/htdig/htdig-3.2.0b6/htsearch/parser.h
new file mode 100644
index 00000000..8f510d8c
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htsearch/parser.h
@@ -0,0 +1,78 @@
//
// parser.h
//
// parser: Parses a boolean expression tree, retrieving and scoring
//         the resulting document list
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: parser.h,v 1.18 2004/05/28 13:15:25 lha Exp $
//

#ifndef _parser_h_
#define _parser_h_

#include "htsearch.h"
#include "WeightWord.h"
#include "ResultList.h"
#include "DocMatch.h"
#include "Database.h"
#include "htString.h"
#include "Stack.h"
#include "HtWordList.h"
#include <ctype.h>

class Collection;

// Recursive-descent parser/evaluator for htsearch boolean queries.
class Parser
{
public:
    Parser();

    int			checkSyntax(List *);
    void		parse(List *, ResultList &);

    // void		setDatabase(const String& db) { words.Open(db, O_RDONLY); }
    void		setCollection(Collection *collection);
    char		*getErrorMessage() {return error.get();}
    int			hadError() {return valid == 0;}

protected:
    void		fullexpr(int);
    int			lexan();
    void		phrase(int);
    void		expr(int);
    void		term(int);
    void		factor(int);
    int			match(int);
    void		setError(char *);
    void		perform_push();
    void		perform_and();
    void		perform_not();
    void		perform_or();
    void		perform_phrase(List * &);

    void		score(List *, double weight, unsigned int flags);

    // Parser state: token stream, current token, one-token lookahead, and
    // the evaluation stack of ResultList pointers.
    List		*tokens;
    List		*result;
    WeightWord		*current;
    int			lookahead;
    int			valid;
    Stack		stack;
    String		error;
    Collection		*collection; // Multiple database support

    HtWordList		words;
};

extern StringList	boolean_keywords;
enum KeywordIndices { AND, OR, NOT };

#endif


diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/parsetest.cc b/debian/htdig/htdig-3.2.0b6/htsearch/parsetest.cc
new file mode 100644
index 00000000..63165377
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/htsearch/parsetest.cc
@@ -0,0 +1,175 @@
//
// parsetest.cc
//
// parsetest: A program to test the ParseTree classes as replacement for the current
//            parsing code
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1999-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: parsetest.cc,v 1.4 2004/05/28 13:15:25 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include <fcntl.h>
#include <errno.h>
#include <unistd.h>

#include "cgi.h"
#include "defaults.h"
#include "ParseTree.h"
#include "AndParseTree.h"
#include "OrParseTree.h"
#include "ExactParseTree.h"
#include "WordContext.h"

// If we have this, we probably want it.
+#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif + +void reportError(char *msg); +void usage(); + +int debug = 0; + +//***************************************************************************** +// int main() +// +int +main(int ac, char **av) +{ + int c; + extern char *optarg; + int override_config=0; + List *searchWords = NULL; + String configFile = DEFAULT_CONFIG_FILE; + String logicalWords; + + // + // Parse command line arguments + // + while ((c = getopt(ac, av, "c:dv")) != -1) + { + switch (c) + { + case 'c': + configFile = optarg; + override_config = 1; + break; + case 'v': + debug++; + break; + case 'd': + debug++; + break; + case '?': + usage(); + break; + } + } + + // + // Parse the CGI parameters. + // + char none[] = ""; + cgi input(optind < ac ? av[optind] : none); + + String originalWords = input["words"]; + originalWords.chop(" \t\r\n"); + + // Set up the config + config.Defaults(&defaults[0]); + + if (access((char*)configFile, R_OK) < 0) + { + reportError(form("Unable to find configuration file '%s'", + configFile.get())); + } + + config.Read(configFile); + + // Initialize htword library (key description + wordtype...) + WordContext::Initialize(config); + + ParseTree *testParse; + + testParse = new ParseTree; + if ( testParse->Parse(originalWords) != NOTOK) + { + cout << "Parsing as a boolean query... " << endl; + cout << "Initial Query:" << testParse->GetQuery() << endl; + cout << "Logical Words:" << testParse->GetLogicalWords() << endl; + } + else + cout << "Parsing as a boolean query FAILED" << endl; + delete testParse; + + testParse = new AndParseTree; + if ( testParse->Parse(originalWords) != NOTOK) + { + cout << "Parsing as an AND query... 
" << endl; + cout << "Initial Query:" << testParse->GetQuery() << endl; + cout << "Logical Words:" << testParse->GetLogicalWords() << endl; + } + else + cout << "Parsing as an AND query FAILED" << endl; + delete testParse; + + testParse = new OrParseTree; + if ( testParse->Parse(originalWords) != NOTOK) + { + cout << "Parsing as an OR query... " << endl; + cout << "Initial Query:" << testParse->GetQuery() << endl; + cout << "Logical Words:" << testParse->GetLogicalWords() << endl; + } + else + cout << "Parsing as an OR query FAILED" << endl; + delete testParse; + + testParse = new ExactParseTree; + if ( testParse->Parse(originalWords) != NOTOK) + { + cout << "Parsing as an EXACT query... " << endl; + cout << "Initial Query:" << testParse->GetQuery() << endl; + cout << "Logical Words:" << testParse->GetLogicalWords() << endl; + } + else + cout << "Parsing as an EXACT query FAILED" << endl; + delete testParse; + +} + +//***************************************************************************** +// void usage() +// Display program usage information--assumes we're running from a cmd line +// +void usage() +{ + cout << "usage: parsetest [-v][-d][-c configfile]\n"; + cout << "This program is part of ht://Dig " << VERSION << "\n\n"; + cout << "Options:\n"; + cout << "\t-v -d\tVerbose mode. This increases the verbosity of the\n"; + cout << "\t\tprogram. Using more than 2 is probably only useful\n"; + cout << "\t\tfor debugging purposes. 
The default verbose mode\n"; + cout << "\t\tgives a progress on what it is doing and where it is.\n\n"; + cout << "\t-c configfile\n"; + cout << "\t\tUse the specified configuration file instead on the\n"; + cout << "\t\tdefault.\n\n"; + exit(0); +} + +//***************************************************************************** +// Report an error and die +// +void reportError(char *msg) +{ + cout << "parsetest: " << msg << "\n\n"; + exit(1); +} diff --git a/debian/htdig/htdig-3.2.0b6/htsearch/qtest.cc b/debian/htdig/htdig-3.2.0b6/htsearch/qtest.cc new file mode 100644 index 00000000..36a6d8c7 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/htsearch/qtest.cc @@ -0,0 +1,252 @@ +// +// qtest.cc +// +// qtest: A program to test the Query classes as replacement for the current +// parsing code +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1999-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: qtest.cc,v 1.5 2004/05/28 13:15:25 lha Exp $ +// + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> + +#include "cgi.h" +#include "defaults.h" +#include "WordContext.h" + +#ifdef HAVE_STD +#include <iostream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#endif /* HAVE_STD */ + +#include "QueryParser.h" +#include "Query.h" +#include "ResultList.h" +#include "Exact.h" +#include "Accents.h" +#include "Prefix.h" +#include "WordSearcher.h" +#include "OrFuzzyExpander.h" +#include "ExactWordQuery.h" +#include "OrQueryParser.h" +#include "AndQueryParser.h" +#include "BooleanQueryParser.h" +#include "GParser.h" + +// If we have this, we probably want it. 
+#ifdef HAVE_GETOPT_H +#include <getopt.h> +#endif + +void reportError(char *msg); +void usage(); + +int debug = 0; + +void +ParseAndGet(QueryParser &parser, const String &string); + +//***************************************************************************** +// int main() +// +int +main(int ac, char **av) +{ + int c; + extern char *optarg; + int override_config=0; + String configFile = DEFAULT_CONFIG_FILE; + String logicalWords; + bool doall = true, + doand = false, + door = false, + dobool = false, + dogeoffs = false; + + // + // Parse command line arguments + // + while ((c = getopt(ac, av, "c:dvkaobg")) != -1) + { + switch (c) + { + case 'c': + configFile = optarg; + override_config = 1; + break; + case 'v': + debug++; + break; + case 'd': + debug++; + break; + case 'a': + doall = false; + doand = true; + break; + case 'o': + doall = false; + door = true; + break; + case 'b': + doall = false; + dobool = true; + break; + case 'g': + doall = false; + dogeoffs = true; + break; + case '?': + usage(); + break; + } + } + + // + // Parse the CGI parameters. + // + char none[] = ""; + cgi input(optind < ac ? av[optind] : none); + + String originalWords = input["words"]; + originalWords.chop(" \t\r\n"); + + HtConfiguration* config= HtConfiguration::config(); + // Set up the config + config->Defaults(&defaults[0]); + + if (access((char*)configFile, R_OK) < 0) + { + reportError(form("Unable to find configuration file '%s'", + configFile.get())); + } + + config->Read(configFile); + + // Initialize htword library (key description + wordtype...) 
+ WordContext::Initialize(*config); + + OrFuzzyExpander exp; + Exact exact(*config); + exact.setWeight(1.0); + exact.openIndex(); + exp.Add(&exact); + Accents accents(*config); + accents.setWeight(0.7); + accents.openIndex(); + exp.Add(&accents); + Prefix prefix(*config); + prefix.setWeight(0.7); + prefix.openIndex(); + exp.Add(&prefix); + QueryParser::SetFuzzyExpander(&exp); + + WordSearcher searcher(config->Find("word_db")); + ExactWordQuery::SetSearcher(&searcher); + + // -- put here your prefered cache + //QueryCache *cache = new XXX; + //Query::SetCache(cache); + + OrQueryParser o; + BooleanQueryParser b; + GParser g; + AndQueryParser a; + + if(doall || doand) + { + cout << "Trying and..." << endl; + ParseAndGet(a, originalWords); + } + + if(doall || door) + { + cout << "Trying or..." << endl; + ParseAndGet(o, originalWords); + } + + if(doall || dobool) + { + cout << "Trying boolean..." << endl; + ParseAndGet(b, originalWords); + } + + if(doall || dogeoffs) + { + cout << "Trying no-precedence-boolean..." << endl; + ParseAndGet(g, originalWords); + } +} + +void +ParseAndGet(QueryParser &parser, const String &query) +{ + Query *q = parser.Parse(query); + if(q) + { + cout << "Parsed: " << q->GetLogicalWords() << endl; + ResultList *l = q->GetResults(); + if(l) + { + cout << "Evaluated with " << l->Count() << " matches" << endl; + if(debug) l->Dump(); + } + else + { + cout << "No matches" << endl;; + } + } + else + { + cerr << "syntax error: " << flush << parser.Error() << endl; + } + delete q; +} + + +//***************************************************************************** +// void usage() +// Display program usage information--assumes we're running from a cmd line +// +void usage() +{ + cout << "usage: qtest [-a][-o][-b][-g][-v][-d][-c configfile]\n"; + cout << "This program is part of ht://Dig " << VERSION << "\n\n"; + cout << "Options:\n"; + cout << "\t-v -d\tVerbose mode. This increases the verbosity of the\n"; + cout << "\t\tprogram. 
Using more than 2 is probably only useful\n"; + cout << "\t\tfor debugging purposes. The default verbose mode\n"; + cout << "\t\tgives a progress on what it is doing and where it is.\n\n"; + cout << "\t-c configfile\n"; + cout << "\t\tUse the specified configuration file instead on the\n"; + cout << "\t\tdefault.\n\n"; + cout << "\t-a\tPerform only and/all parsing\n\n"; + cout << "\t-o\tPerform only or/any parsing\n\n"; + cout << "\t-b\tPerform only boolean parsing\n\n"; + cout << "\t-g\tPerform only no-precedence-boolean parsing\n\n"; + exit(0); +} + +//***************************************************************************** +// Report an error and die +// +void reportError(char *msg) +{ + cout << "qtest: " << msg << "\n\n"; + exit(1); +} |