Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

include/om/omenquire.h

Go to the documentation of this file.
00001 
00004 /* ----START-LICENCE----
00005  * Copyright 1999,2000,2001 BrightStation PLC
00006  * Copyright 2001,2002 Ananova Ltd
00007  * Copyright 2002,2003 Olly Betts
00008  *
00009  * This program is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU General Public License as
00011  * published by the Free Software Foundation; either version 2 of the
00012  * License, or (at your option) any later version.
00013  *
00014  * This program is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU General Public License
00020  * along with this program; if not, write to the Free Software
00021  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00022  * USA
00023  * -----END-LICENCE-----
00024  */
00025 
00026 #ifndef OM_HGUARD_OMENQUIRE_H
00027 #define OM_HGUARD_OMENQUIRE_H
00028 
00029 #include "om/omtypes.h"
00030 #include "om/omdocument.h"
00031 #include "om/omdatabase.h"
00032 #include "om/omerror.h"
00033 #include <string>
00034 #include <time.h> // for time_t
00035 
00036 class OmQuery;
00037 class OmErrorHandler;
00038 class OmWeight;
00039 
00043 class OmMSetIterator {
              // Iterator type returned by OmMSet::begin(), end(), back() and
              // operator[].  Dereferencing yields an om_docid.  The only data
              // member is the `internal` pointer; Internal is declared here but
              // defined elsewhere.
00044     public:
00045         friend class OmMSet;
00046 
00047         class Internal;
              // Pointer to the implementation object (public in this version).
00049         Internal *internal;
00050 
              // Equality is a friend free function; operator!= below is defined
              // in terms of it.
00051         friend bool operator==(const OmMSetIterator &a,
00052                                const OmMSetIterator &b);
00053 
00054     private:
              // Wraps an existing Internal.  Private, so only this class and its
              // friend OmMSet can use it.
00055         OmMSetIterator(Internal *internal_);
00056 
00057     public:
              // Default constructor, destructor, copy constructor and copy
              // assignment are all user-declared and defined out of line.
00061         OmMSetIterator();
00062 
00063         ~OmMSetIterator();
00064 
00066         OmMSetIterator(const OmMSetIterator &other);
00067 
              // Note: returns void, so assignments cannot be chained.
00069         void operator=(const OmMSetIterator &other);
00070 
              // Pre-increment returns a reference to this iterator.
00072         OmMSetIterator & operator++();
00073 
              // Post-increment returns void, so `*it++` does not compile.
00074         void operator++(int);
00075 
              // Dereference: returns a document id by value.
00077         om_docid operator *() const;
00078 
              // Accessors for the item the iterator refers to.  Their exact
              // semantics are defined by the implementation, which is not part
              // of this header.
00097         OmDocument get_document() const;
00098 
00105         om_doccount get_rank() const;
00106 
00108         om_weight get_weight() const;
00109 
00139         om_doccount get_collapse_count() const;
00140 
00145         om_percent get_percent() const;
00146 
              // Human-readable description of this object (presumably intended
              // for debugging - confirm against the implementation).
00150         std::string get_description() const;
00151 
00153 
              // Iterator traits: declared as an input iterator over om_docid.
00154         typedef std::input_iterator_tag iterator_category;
00155         typedef om_docid value_type;
00156         typedef om_doccount_diff difference_type;
00157         typedef om_docid * pointer;
00158         typedef om_docid & reference;
00160 };
00161 
00162 inline bool operator!=(const OmMSetIterator &a,
00163                        const OmMSetIterator &b)
00164 {
          // Inequality for OmMSetIterator: simply the negation of the friend
          // operator== declared in the class.
00165     return !(a == b);
00166 }
00167 
00171 class OmMSet {
              // Match set, as returned by OmEnquire::get_mset().  Exposes an
              // STL-container-like interface (size/empty/begin/end/swap plus the
              // usual typedefs) whose iterator type is OmMSetIterator.  The only
              // data member is the `internal` pointer.
00172     public:
00173         class Internal;
              // Pointer to the implementation object (defined elsewhere).
00175         Internal *internal;
00176 
00177     public:
00178         // FIXME: public for now, private would be better
              // Wraps an existing Internal object.
00180         OmMSet(OmMSet::Internal * internal_);
00181 
              // Default constructor, destructor, copy constructor and copy
              // assignment (which returns void) are defined out of line.
00183         OmMSet();
00184 
00186         ~OmMSet();
00187 
00189         OmMSet(const OmMSet & other);
00190 
00192         void operator=(const OmMSet &other);
00193 
              // Three const overloads of fetch(): an iterator range, a single
              // item, and no arguments.
              // NOTE(review): presumably these pre-fetch document data for the
              // given items / the whole set - confirm against the implementation.
00209         void fetch(const OmMSetIterator &begin,
00210                    const OmMSetIterator &end) const;
00211 
00214         void fetch(const OmMSetIterator &item) const;
00215 
00218         void fetch() const;
00219 
              // Convert either a raw weight or the item at an iterator to an
              // om_percent value.
00224         om_percent convert_to_percent(om_weight wt) const;
00225 
00227         om_percent convert_to_percent(const OmMSetIterator &it) const;
00228 
              // Per-term statistics looked up by term name.
00236         om_doccount get_termfreq(const om_termname &tname) const;
00237 
00245         om_weight get_termweight(const om_termname &tname) const;
00246 
              // Statistics about the match as a whole; semantics are defined by
              // the implementation, which is not visible in this header.
00253         om_doccount get_firstitem() const;
00254 
00261         om_doccount get_matches_lower_bound() const;
00262 
00272         om_doccount get_matches_estimated() const;
00273 
00280         om_doccount get_matches_upper_bound() const;
00281 
00287         om_weight get_max_possible() const;
00288 
00302         om_weight get_max_attained() const;
00303 
              // Container-style interface.
00304         om_doccount size() const;
00305 
00306         om_doccount max_size() const;
00307 
00308         bool empty() const;
00309 
00310         void swap(OmMSet & other);
00311 
00312         OmMSetIterator begin() const;
00313 
00314         OmMSetIterator end() const;
00315 
              // Unlike std containers' back(), this returns an iterator rather
              // than a reference to an element.
00316         OmMSetIterator back() const;
00317         
              // Indexed access; also returns an iterator.
00327         OmMSetIterator operator[](om_doccount i) const;
00328 
00330 
              // Container typedefs.  Note that const_reference is declared as a
              // non-const reference, identical to `reference`.
00331         typedef std::input_iterator_tag iterator_category;
00332         typedef OmMSetIterator value_type; // FIXME: not assignable...
00333         typedef OmMSetIterator iterator;
00334         typedef OmMSetIterator const_iterator;
00335         typedef OmMSetIterator & reference; // Hmm
00336         typedef OmMSetIterator & const_reference;
00337         typedef OmMSetIterator * pointer; // Hmm
00338         typedef om_doccount_diff difference_type;
00339         typedef om_doccount size_type;
00341         
              // Human-readable description of this object (presumably intended
              // for debugging - confirm against the implementation).
00345         std::string get_description() const;
00346 };
00347 
00349 class OmESetIterator {
              // Iterator type returned by OmESet::begin() and OmESet::end().
              // Dereferencing yields a const reference to an om_termname.  The
              // only data member is the `internal` pointer.
00350     public:
00351         friend class OmESet;
00352         class Internal;
              // Pointer to the implementation object (defined elsewhere).
00354         Internal *internal;
00355 
              // Equality is a friend free function; operator!= below is defined
              // in terms of it.
00356         friend bool operator==(const OmESetIterator &a,
00357                                const OmESetIterator &b);
00358 
00359     private:
00360 
              // Wraps an existing Internal.  Private, so only this class and its
              // friend OmESet can use it.
00361         OmESetIterator(Internal *internal_);
00362 
00363     public:
              // Default constructor, destructor, copy constructor and copy
              // assignment (which returns void) are defined out of line.
00367         OmESetIterator();
00368 
00370         ~OmESetIterator();
00371 
00373         OmESetIterator(const OmESetIterator &other);
00374 
00376         void operator=(const OmESetIterator &other);
00377 
              // Pre-increment returns a reference to this iterator;
              // post-increment returns void, so `*it++` does not compile.
00378         OmESetIterator & operator++();
00379 
00380         void operator++(int);
00381 
              // Dereference: the term name, by const reference.
00383         const om_termname & operator *() const;
00384 
              // Weight associated with the current term.
00386         om_weight get_weight() const;
00387 
              // Human-readable description of this object (presumably intended
              // for debugging - confirm against the implementation).
00391         std::string get_description() const;
00392 
00394 
              // Iterator traits: declared as an input iterator over om_termname.
00395         typedef std::input_iterator_tag iterator_category;
00396         typedef om_termname value_type;
00397         typedef om_termcount_diff difference_type;
00398         typedef om_termname * pointer;
00399         typedef om_termname & reference;
00401 };
00402 
00403 inline bool
00404 operator!=(const OmESetIterator &a, const OmESetIterator &b)
00405 {
          // Inequality for OmESetIterator: simply the negation of the friend
          // operator== declared in the class.
00406     return !(a == b);
00407 }
00408 
00413 class OmESet {
              // Expand set, as returned by OmEnquire::get_eset().  Iterated with
              // OmESetIterator, whose value type is om_termname.  The only data
              // member is the `internal` pointer.
00414     public:
00415         class Internal;
              // Pointer to the implementation object (defined elsewhere).
00417         Internal *internal;
00418 
              // Default constructor, destructor, copy constructor and copy
              // assignment (which returns void) are defined out of line.
00420         OmESet();
00421 
00423         ~OmESet();
00424 
00426         OmESet(const OmESet & other);
00427 
00429         void operator=(const OmESet &other);
00430 
              // NOTE(review): presumably a bound on the number of terms which
              // could have been returned - confirm against the implementation.
00435         om_termcount get_ebound() const;
00436 
              // Container-style queries and iteration.
00438         om_termcount size() const;
00439 
00441         bool empty() const;
00442 
00444         OmESetIterator begin() const;
00445 
00447         OmESetIterator end() const;
00448 
              // Human-readable description of this object (presumably intended
              // for debugging - confirm against the implementation).
00453         std::string get_description() const;
00454 };
00455 
00460 class OmRSet {
              // A set of documents identified by om_docid.  It is passed to
              // OmEnquire::get_mset() (optionally) and OmEnquire::get_eset().
              // Every operation taking a document id has a convenience overload
              // taking an OmMSetIterator, which simply dereferences the iterator
              // and forwards to the om_docid version.
00461     public:
00463         class Internal;
              // Pointer to the implementation object (defined elsewhere).
00465         Internal *internal;
00466 
              // Copy constructor, copy assignment (returns void), default
              // constructor and destructor are defined out of line.
00468         OmRSet(const OmRSet &rset);
00469 
00471         void operator=(const OmRSet &rset);
00472 
00474         OmRSet();
00475 
00477         ~OmRSet();
00478 
              // Number of documents in the set / whether it is empty.
00480         om_doccount size() const;
00481 
00483         bool empty() const;
00484 
              // Add a document by id, or the document an MSet iterator refers to.
00486         void add_document(om_docid did);
00487         
00489         void add_document(const OmMSetIterator & i) { add_document(*i); }
00490 
              // Remove a document by id, or the document an MSet iterator
              // refers to.
00492         void remove_document(om_docid did);
00493 
00495         void remove_document(const OmMSetIterator & i) { remove_document(*i); }
00496 
              // Membership test by id, or for the document an MSet iterator
              // refers to.
00498         bool contains(om_docid did) const;
00499 
              // const-qualified to match contains(om_docid), which it forwards
              // to; without it this overload could not be called on a const
              // OmRSet.
00501         bool contains(const OmMSetIterator & i) const { return contains(*i); }
00502 
              // Human-readable description of this object (presumably intended
              // for debugging - confirm against the implementation).
00507         std::string get_description() const;
00508 };
00509 
00512 class OmMatchDecider {
              // Abstract functor interface taking an OmDocument.  A pointer to
              // one can be passed to OmEnquire::get_mset() and
              // OmEnquire::register_match_decider().
00513     public:
              // Called with a document; note the result type is int, not bool.
              // NOTE(review): presumably non-zero means "accept this document" -
              // confirm against the matcher implementation.
00516         virtual int operator()(const OmDocument &doc) const = 0;
00517 
              // Virtual destructor so subclasses can be destroyed via a base
              // pointer.
00519         virtual ~OmMatchDecider() {}
00520 };
00521 
00524 class OmExpandDecider {
              // Abstract functor interface taking a term name.  A pointer to one
              // can be passed to OmEnquire::get_eset().
00525     public:
              // Called with a term name; note the result type is int, not bool.
              // NOTE(review): presumably non-zero means "accept this term" -
              // confirm against the expand implementation.
00528         virtual int operator()(const om_termname & tname) const = 0;
00529 
              // Virtual destructor so subclasses can be destroyed via a base
              // pointer.
00531         virtual ~OmExpandDecider() {}
00532 };
00533 
00547 class OmEnquire {
              // Query interface: constructed from an OmDatabase, configured with
              // the set_*() methods, then asked for results via get_mset() (an
              // OmMSet) and get_eset() (an OmESet).  The only data member is the
              // `internal` pointer.
00548     private:
              // Copy constructor and copy assignment are declared private, so
              // OmEnquire objects cannot be copied from outside the class.
00550         OmEnquire(const OmEnquire &);
00551 
00553         void operator=(const OmEnquire &);
00554 
00555     public:
00556         class Internal;
              // Pointer to the implementation object (defined elsewhere).
00558         Internal *internal;
00559 
              // Construct from a database; the error handler is optional and
              // defaults to a null pointer.
00575         OmEnquire(const OmDatabase &databases,
00576                   OmErrorHandler * errorhandler_ = 0);
00577 
00587         ~OmEnquire();
00588 
              // Set / retrieve the query.  Note that get_query() is not const,
              // unlike the other getters in this class.
00596         void set_query(const OmQuery & query_);
00597 
00604         const OmQuery & get_query();
00605 
              // Match configuration setters.  The weighting scheme is taken by
              // const reference; OmWeight's own comments say OmEnquire is a
              // friend so that it can clone the scheme.
00612         void set_weighting_scheme(const OmWeight &weight_);
00613 
00620         void set_collapse_key(om_valueno collapse_key);
00621 
00628         void set_sort_forward(bool sort_forward);
00629 
              // weight_cutoff is optional and defaults to 0.
00647         void set_cutoff(int percent_cutoff, om_weight weight_cutoff = 0);
00648 
00662         void set_sorting(om_valueno sort_key, int sort_bands);
00663 
00675         void set_bias(om_weight bias_weight, time_t bias_halflife);
00676 
              // Run the match and return an OmMSet.  `first` and `maxitems`
              // select the window of results (presumably a zero-based offset and
              // a maximum count - confirm); the relevance set and match decider
              // are optional and default to null pointers.
00697         OmMSet get_mset(om_doccount first,
00698                         om_doccount maxitems,
00699                         const OmRSet * omrset = 0,
00700                         const OmMatchDecider * mdecider = 0) const;
00701 
              // Flag values with distinct bits (1 and 2); presumably meant to be
              // OR-ed together into the `flags` argument of get_eset().
00702         static const int include_query_terms = 1;
00703         static const int use_exact_termfreq = 2;
              // Build an OmESet from a relevance set.  flags defaults to 0, k to
              // 1.0 and the expand decider to a null pointer.
00727         OmESet get_eset(om_termcount maxitems,
00728                         const OmRSet & omrset,
00729                         int flags = 0,
00730                         double k = 1.0,
00731                         const OmExpandDecider * edecider = 0) const;
00732 
              // Convenience overload: lets a caller pass just an expand decider;
              // forwards to the full version with flags = 0 and k = 1.0.
00747         inline OmESet get_eset(om_termcount maxitems, const OmRSet & omrset,
00748                                const OmExpandDecider * edecider) const {
00749             return get_eset(maxitems, omrset, 0, 1.0, edecider);
00750         }
00751 
              // Begin/end term iterators for a document, identified either by
              // document id or by an MSet iterator.
00781         OmTermIterator get_matching_terms_begin(om_docid did) const;
00782 
00784         OmTermIterator get_matching_terms_end(om_docid did) const;
00785 
00809         OmTermIterator get_matching_terms_begin(const OmMSetIterator &it) const;
00810 
00812         OmTermIterator get_matching_terms_end(const OmMSetIterator &it) const;
00813 
              // Associate a match decider with a name; the decider defaults to a
              // null pointer.
              // NOTE(review): whether passing null unregisters the name cannot
              // be determined from this header.
00816         void register_match_decider(const std::string &name,
00817                                     const OmMatchDecider *mdecider = NULL);
00818 
              // Human-readable description of this object (presumably intended
              // for debugging - confirm against the implementation).
00822         std::string get_description() const;
00823 };
00824 
00825 class SocketServer;
00826 
00828 class OmWeight {
          // Abstract base class for weighting schemes.  A scheme object acts as
          // a prototype: create() clones it and fills in the per-term state
          // (statistics source, query size, wqf and term name) on the clone.
00829     friend class OmEnquire; // So OmEnquire can clone us
00830     friend class SocketServer; // So SocketServer can clone us - FIXME
00831     public:
00832         class Internal;
00833     private:
              // Copying is disabled: both are private.  Note the assignment
              // operator takes a non-const reference, unlike the other classes
              // in this header.
00834         OmWeight(const OmWeight &);
00835         void operator=(OmWeight &);
00836 
00838         //
00839         // Each subclass should implement this as:
00840         // virtual OmFooWeight * clone() const {
00841         //     return new OmFooWeight(param1, param2);
00842         // }
00843         virtual OmWeight * clone() const = 0;
00844 
00845     protected:
              // Per-term state, assigned by create().
00846         const Internal * internal; // OmWeight::Internal == StatsSource
00847         om_doclength querysize;
00848         om_termcount wqf;
00849         om_termname tname;
00850 
00851     public:
              // Zero-initialise the scalar members so that an object which has
              // not been through create() holds determinate values.
00852         OmWeight() : internal(0), querysize(0), wqf(0) { }
00853         virtual ~OmWeight() { }
00854 
              // Return a clone of this scheme with the per-term state set from
              // the arguments.  The clone comes from clone(), which the
              // subclasses in this header implement with `new`, so the caller
              // presumably owns the returned object - confirm at call sites.
00866         OmWeight * create(const Internal * internal_, om_doclength querysize_,
00867                           om_termcount wqf_, om_termname tname_) const {
00868             OmWeight * wt = clone();
00869             wt->internal = internal_;
00870             wt->querysize = querysize_;
00871             wt->wqf = wqf_;
00872             wt->tname = tname_;
00873             return wt;
00874         }
00875 
00877         //
00878         //  If the subclass is called FooWeight, this should return "Foo".
00879         virtual std::string name() const = 0;
00880 
              // Serialise this scheme's parameters to a string.
00882         virtual std::string serialise() const = 0;
00883 
              // Build a new scheme object from a string produced by serialise().
              // Declared with an unqualified name: a qualified name
              // (OmWeight::unserialise) is not permitted on a member declaration
              // inside its own class.
00885         virtual OmWeight * unserialise(const std::string &s) const = 0;
00886 
              // Weight contributions; each subclass supplies the formula.
              // get_sumpart() takes the within-document frequency and document
              // length, get_sumextra() the document length only, and the
              // get_max*() variants take no per-document input.
00894         virtual om_weight get_sumpart(om_termcount wdf,
00895                                       om_doclength len) const = 0;
00896 
00902         virtual om_weight get_maxpart() const = 0;
00903 
00912         virtual om_weight get_sumextra(om_doclength len) const = 0;
00913 
00917         virtual om_weight get_maxextra() const = 0;
00918 
              // Whether get_sumpart() needs the `len` argument; the default
              // answer is true and subclasses may override it.
00920         virtual bool get_sumpart_needs_doclength() const { return true; }
00921 };
00922 
00924 class BoolWeight : public OmWeight {
              // Weighting scheme in which every weight contribution is 0.  It
              // has no parameters, so serialise() returns an empty string and
              // unserialise() ignores its argument.
00925     public:
              // clone() and unserialise() both return a freshly allocated
              // BoolWeight.
00926         OmWeight * clone() const {
00927             return new BoolWeight;
00928         }
00929         BoolWeight() { }
00930         ~BoolWeight() { }
00931         std::string name() const { return "Bool"; }
00932         std::string serialise() const { return ""; }
00933         OmWeight * unserialise(const std::string & /*s*/) const {
00934             return new BoolWeight;
00935         }
              // All weight parts are constant zero; the arguments are unused.
00936         om_weight get_sumpart(om_termcount /*wdf*/, om_doclength /*len*/) const { return 0; }
00937         om_weight get_maxpart() const { return 0; }
00938 
00939         om_weight get_sumextra(om_doclength /*len*/) const { return 0; }
00940         om_weight get_maxextra() const { return 0; }
00941 
              // Document length is never needed since get_sumpart() ignores it.
00942         bool get_sumpart_needs_doclength() const { return false; }      
00943 };
00944 
00946 //
00947 // BM25 weighting options : The BM25 formula is \f[
00948 //      \frac{C.s_{q}}{1+L_{d}}+\sum_{t}\frac{(A+1)q_{t}}{A+q_{t}}.\frac{(B+1)f_{t,d}}{B((1-D)+DL_{d})+f_{t,d}}.w_{t}
00949 // \f] where
00950 //   - \f$w_{t}\f$ is the termweight of term t
00951 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00952 //   - \f$q_{t}\f$ is the within query frequency of term t
00953 //   - \f$L_{d}\f$ is the normalised length of document d
00954 //   - \f$s_{q}\f$ is the size of the query
00955 //   - \f$A\f$, \f$B\f$, \f$C\f$ and \f$D\f$ are user specified parameters
00956 class BM25Weight : public OmWeight {
00957     private:
              // Values declared mutable so that const member functions can
              // update them; no constructor initialises them.
              // NOTE(review): presumably they are filled in by
              // calc_termweight() - confirm against its definition.
00958         mutable om_weight termweight;
00959         mutable om_doclength lenpart;
00960         mutable double BD;
00961 
              // User-supplied parameters of the BM25 formula, plus min_normlen.
00962         double A, B, C, D;
00963         om_doclength min_normlen;
00964 
              // Set to false by both constructors.
00965         mutable bool weight_calculated;
00966 
              // Defined out of line; const, so it can only modify the mutable
              // members above.
00967         void calc_termweight() const;
00968 
00969     public:
              // Construct with explicit parameters.  Negative A, B and C are
              // clamped to 0, and D is clamped to the range [0, 1].
              // min_normlen_ is taken as a double and stored in an om_doclength
              // member without any range check.
00988         BM25Weight(double A_, double B_, double C_, double D_,
00989                    double min_normlen_)
00990                 : A(A_), B(B_), C(C_), D(D_), min_normlen(min_normlen_),
00991                   weight_calculated(false)
00992         {
00993             if (A < 0) A = 0;
00994             if (B < 0) B = 0;
00995             if (C < 0) C = 0;
00996             if (D < 0) D = 0; else if (D > 1) D = 1;
00997         }
              // Default parameters: A = 1, B = 1, C = 0, D = 0.5,
              // min_normlen = 0.5.
00998         BM25Weight() : A(1), B(1), C(0), D(0.5), min_normlen(0.5),
00999                        weight_calculated(false) { }
01000 
              // Returns a new BM25Weight with the same five parameters; the
              // mutable values and the inherited per-term state are not copied.
01001         OmWeight * clone() const {
01002             return new BM25Weight(A, B, C, D, min_normlen);
01003         }
01004         ~BM25Weight() { }
01005         std::string name() const { return "BM25"; }
              // The remaining OmWeight overrides are defined out of line.
01006         std::string serialise() const;
01007         OmWeight * unserialise(const std::string & s) const;
01008         om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
01009         om_weight get_maxpart() const;
01010 
01011         om_weight get_sumextra(om_doclength len) const;
01012         om_weight get_maxextra() const;
01013 
              // NOTE(review): this reads lenpart, which no constructor
              // initialises, without checking weight_calculated.  If it can be
              // called before lenpart has been assigned, the result is
              // indeterminate - confirm the call order in the implementation.
01014         bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
01015 };
01016 
01018 //
01019 // The Traditional weighting scheme formula is \f[
01020 //      \sum_{t}\frac{f_{t,d}}{k.L_{d}+f_{t,d}}.w_{t}
01021 // \f] where
01022 //   - \f$w_{t}\f$ is the termweight of term t
01023 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
01024 //   - \f$L_{d}\f$ is the normalised length of document d
01025 //   - \f$k\f$ is a user specifiable parameter
01026 //
01027 // TradWeight is equivalent to BM25Weight(1, 1, 0, k, 0)
01028 class TradWeight : public OmWeight {
01029     private:
              // Values declared mutable so that const member functions can
              // update them; the constructor does not initialise them.
              // NOTE(review): presumably they are filled in by
              // calc_termweight() - confirm against its definition.
01030         mutable om_weight termweight;
01031         mutable om_doclength lenpart;
01032 
              // The user-specified parameter k of the formula.
01033         double param_k;
01034 
              // Set to false by the constructor.
01035         mutable bool weight_calculated;
01036 
              // Defined out of line; const, so it can only modify the mutable
              // members above.
01037         void calc_termweight() const;
01038 
01039     public:
01041         //
01042         // @param k  parameter governing the importance of within
01043         //           document frequency and document length - any positive
01044         //           number, 0 being wdf and doc length not used.  Default
01045         //           is 1.
              // A negative k is clamped to 0 rather than rejected.
01046         TradWeight(double k = 1) : param_k(k), weight_calculated(false) {
01047             if (param_k < 0) param_k = 0;
01048         }
              // Returns a new TradWeight with the same k; the mutable values and
              // the inherited per-term state are not copied.
01049         OmWeight * clone() const {
01050             return new TradWeight(param_k);
01051         }
01052         ~TradWeight() { }
01053         std::string name() const { return "Trad"; }
              // The remaining OmWeight overrides are defined out of line.
01054         std::string serialise() const;
01055         OmWeight * unserialise(const std::string & s) const;
01056         
01057         om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
01058         om_weight get_maxpart() const;
01059 
01060         om_weight get_sumextra(om_doclength len) const;
01061         om_weight get_maxextra() const;
01062 
              // NOTE(review): this reads lenpart, which the constructor does not
              // initialise, without checking weight_calculated.  If it can be
              // called before lenpart has been assigned, the result is
              // indeterminate - confirm the call order in the implementation.
01063         bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
01064 };
01065 
01066 #endif /* OM_HGUARD_OMENQUIRE_H */

Documentation for Xapian (version 0.6.4).
Generated on 10 Apr 2003 by Doxygen 1.2.15.