Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

include/om/omenquire.h

Go to the documentation of this file.
00001 
00004 /* ----START-LICENCE----
00005  * Copyright 1999,2000,2001 BrightStation PLC
00006  * Copyright 2001,2002 Ananova Ltd
00007  * Copyright 2002 Olly Betts
00008  *
00009  * This program is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU General Public License as
00011  * published by the Free Software Foundation; either version 2 of the
00012  * License, or (at your option) any later version.
00013  *
00014  * This program is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU General Public License
00020  * along with this program; if not, write to the Free Software
00021  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00022  * USA
00023  * -----END-LICENCE-----
00024  */
00025 
00026 #ifndef OM_HGUARD_OMENQUIRE_H
00027 #define OM_HGUARD_OMENQUIRE_H
00028 
00029 #include "om/omtypes.h"
00030 #include "om/omdocument.h"
00031 #include "om/omdatabase.h"
00032 #include "om/omerror.h"
00033 #include "om/omsettings.h"
00034 #include <string>
00035 
00036 class OmQuery;
00037 class OmErrorHandler;
00038 class OmWeight;
00039 
00043 class OmMSetIterator {
00044     public:
00045         friend class OmMSet;
00046 
00047         class Internal;
00049         Internal *internal;
00050 
00051         friend bool operator==(const OmMSetIterator &a,
00052                                const OmMSetIterator &b);
00053 
00054     private:
00055         OmMSetIterator(Internal *internal_);
00056 
00057     public:
00061         OmMSetIterator();
00062 
00063         ~OmMSetIterator();
00064 
00066         OmMSetIterator(const OmMSetIterator &other);
00067 
00069         void operator=(const OmMSetIterator &other);
00070 
00072         OmMSetIterator & operator++();
00073 
00074         void operator++(int);
00075 
00077         om_docid operator *() const;
00078 
00097         OmDocument get_document() const;
00098 
00105         om_doccount get_rank() const;
00106 
00108         om_weight get_weight() const;
00109 
00114         om_percent get_percent() const;
00115 
00119         std::string get_description() const;
00120 
00122 
00123         typedef std::input_iterator_tag iterator_category;
00124         typedef om_docid value_type;
00125         typedef om_doccount_diff difference_type;
00126         typedef om_docid * pointer;
00127         typedef om_docid & reference;
00129 };
00130 
00131 inline bool operator!=(const OmMSetIterator &a,
00132                        const OmMSetIterator &b)
00133 {
00134     return !(a == b);
00135 }
00136 
00140 class OmMSet {
00141     public:
00142         class Internal;
00144         Internal *internal;
00145 
00146     public:
00147         // FIXME: public for now, private would be better
00149         OmMSet(OmMSet::Internal * internal_);
00150 
00152         OmMSet();
00153 
00155         ~OmMSet();
00156 
00158         OmMSet(const OmMSet & other);
00159 
00161         void operator=(const OmMSet &other);
00162 
00178         void fetch(const OmMSetIterator &begin,
00179                    const OmMSetIterator &end) const;
00180 
00183         void fetch(const OmMSetIterator &item) const;
00184 
00187         void fetch() const;
00188 
00193         om_percent convert_to_percent(om_weight wt) const;
00194 
00196         om_percent convert_to_percent(const OmMSetIterator &it) const;
00197 
00205         om_doccount get_termfreq(const om_termname &tname) const;
00206 
00214         om_weight get_termweight(const om_termname &tname) const;
00215 
00222         om_doccount get_firstitem() const;
00223 
00230         om_doccount get_matches_lower_bound() const;
00231 
00241         om_doccount get_matches_estimated() const;
00242 
00249         om_doccount get_matches_upper_bound() const;
00250 
00256         om_weight get_max_possible() const;
00257 
00271         om_weight get_max_attained() const;
00272 
00273         om_doccount size() const;
00274 
00275         om_doccount max_size() const;
00276 
00277         bool empty() const;
00278 
00279         void swap(OmMSet & other);
00280 
00281         OmMSetIterator begin() const;
00282 
00283         OmMSetIterator end() const;
00284 
00285         OmMSetIterator back() const;
00286         
00296         OmMSetIterator operator[](om_doccount i) const;
00297 
00299 
00300         typedef std::input_iterator_tag iterator_category;
00301         typedef OmMSetIterator value_type; // FIXME: not assignable...
00302         typedef OmMSetIterator iterator;
00303         typedef OmMSetIterator const_iterator;
00304         typedef OmMSetIterator & reference; // Hmm
00305         typedef OmMSetIterator & const_reference;
00306         typedef OmMSetIterator * pointer; // Hmm
00307         typedef om_doccount_diff difference_type;
00308         typedef om_doccount size_type;
00310         
00314         std::string get_description() const;
00315 };
00316 
00318 class OmESetIterator {
00319     public:
00320         friend class OmESet;
00321         class Internal;
00323         Internal *internal;
00324 
00325         friend bool operator==(const OmESetIterator &a,
00326                                const OmESetIterator &b);
00327 
00328     private:
00329 
00330         OmESetIterator(Internal *internal_);
00331 
00332     public:
00336         OmESetIterator();
00337 
00339         ~OmESetIterator();
00340 
00342         OmESetIterator(const OmESetIterator &other);
00343 
00345         void operator=(const OmESetIterator &other);
00346 
00347         OmESetIterator & operator++();
00348 
00349         void operator++(int);
00350 
00352         const om_termname & operator *() const;
00353 
00355         om_weight get_weight() const;
00356 
00360         std::string get_description() const;
00361 
00363 
00364         typedef std::input_iterator_tag iterator_category;
00365         typedef om_termname value_type;
00366         typedef om_termcount_diff difference_type;
00367         typedef om_termname * pointer;
00368         typedef om_termname & reference;
00370 };
00371 
00372 inline bool
00373 operator!=(const OmESetIterator &a, const OmESetIterator &b)
00374 {
00375     return !(a == b);
00376 }
00377 
00382 class OmESet {
00383     public:
00384         class Internal;
00386         Internal *internal;
00387 
00389         OmESet();
00390 
00392         ~OmESet();
00393 
00395         OmESet(const OmESet & other);
00396 
00398         void operator=(const OmESet &other);
00399 
00404         om_termcount get_ebound() const;
00405 
00407         om_termcount size() const;
00408 
00410         bool empty() const;
00411 
00413         OmESetIterator begin() const;
00414 
00416         OmESetIterator end() const;
00417 
00422         std::string get_description() const;
00423 };
00424 
00429 class OmRSet {
00430     public:
00432         class Internal;
00434         Internal *internal;
00435 
00437         OmRSet(const OmRSet &rset);
00438 
00440         void operator=(const OmRSet &rset);
00441 
00443         OmRSet();
00444 
00446         ~OmRSet();
00447 
00449         om_doccount size() const;
00450 
00452         bool empty() const;
00453 
00455         void add_document(om_docid did);
00456         
00458         void add_document(const OmMSetIterator & i) { add_document(*i); }
00459 
00461         void remove_document(om_docid did);
00462 
00464         void remove_document(const OmMSetIterator & i) { remove_document(*i); }
00465 
00467         bool contains(om_docid did) const;
00468 
00470         bool contains(const OmMSetIterator & i) { return contains(*i); }
00471 
00476         std::string get_description() const;
00477 };
00478 
00481 class OmMatchDecider {
00482     public:
00485         virtual int operator()(const OmDocument &doc) const = 0;
00486 
00488         virtual ~OmMatchDecider() {}
00489 };
00490 
00493 class OmExpandDecider {
00494     public:
00497         virtual int operator()(const om_termname & tname) const = 0;
00498 
00500         virtual ~OmExpandDecider() {}
00501 };
00502 
00516 class OmEnquire {
00517     private:
00519         OmEnquire(const OmEnquire &);
00520 
00522         void operator=(const OmEnquire &);
00523 
00524     public:
00525         class Internal;
00527         Internal *internal;
00528 
00544         OmEnquire(const OmDatabase &databases,
00545                   OmErrorHandler * errorhandler_ = 0);
00546 
00555         ~OmEnquire();
00556 
00564         void set_query(const OmQuery & query_);
00565 
00572         const OmQuery & get_query();
00573 
00580         void set_weighting_scheme(const OmWeight &weight_);
00581 
00603         OmMSet get_mset(om_doccount first,
00604                         om_doccount maxitems,
00605                         const OmRSet * omrset = 0,
00606                         const OmSettings * moptions = 0,
00607                         const OmMatchDecider * mdecider = 0) const;
00608 
00630         OmESet get_eset(om_termcount maxitems,
00631                         const OmRSet & omrset,
00632                         bool exclude_query_terms = true,
00633                         bool use_exact_termfreq = false,
00634                         double k = 1.0,
00635                         const OmExpandDecider * edecider = 0) const;
00636 
00651         inline OmESet get_eset(om_termcount maxitems, const OmRSet & omrset,
00652                                const OmExpandDecider * edecider) const {
00653             return get_eset(maxitems, omrset, true, false, 1.0, edecider);
00654         }
00655 
00656         // This is needed as otherwise the first overloaded method is
00657         // used (since X* matches bool...)
00658         inline OmESet get_eset(om_termcount, const OmRSet &, OmSettings *,
00659                                OmExpandDecider *dummy = 0) const {
00660             (void)dummy;
00661             throw "You need to update this call to OmEnquire::get_eset() "
00662                   "- it no longer takes an OmSettings * parameter";
00663         }
00664 
00694         OmTermIterator get_matching_terms_begin(om_docid did) const;
00695 
00697         OmTermIterator get_matching_terms_end(om_docid did) const;
00698 
00722         OmTermIterator get_matching_terms_begin(const OmMSetIterator &it) const;
00723 
00725         OmTermIterator get_matching_terms_end(const OmMSetIterator &it) const;
00726 
00729         void register_match_decider(const std::string &name,
00730                                     const OmMatchDecider *mdecider = NULL);
00731 
00735         std::string get_description() const;
00736 };
00737 
00739 class OmWeight {
00740     friend class OmEnquire; // So OmEnquire can clone us
00741     public:
00742         class Internal;
00743     private:
00744         OmWeight(const OmWeight &);
00745         void operator=(OmWeight &);
00746 
00748         //
00749         // Each subclass should implement this as:
00750         // virtual OmFooWeight * clone() const {
00751         //     return new OmFooWeight(param1, param2);
00752         // }
00753         virtual OmWeight * clone() const = 0;
00754 
00755     protected:
00756         const Internal * internal; // OmWeight::Internal == StatsSource
00757         om_doclength querysize;
00758         om_termcount wqf;
00759         om_termname tname;
00760 
00761     public:
00762         OmWeight() { }
00763         virtual ~OmWeight() { }
00764 
00775         OmWeight * create(const Internal * internal_, om_doclength querysize_,
00776                           om_termcount wqf_, om_termname tname_) {
00777             OmWeight * wt = clone();
00778             wt->internal = internal_;
00779             wt->querysize = querysize_;
00780             wt->wqf = wqf_;
00781             wt->tname = tname_;
00782             return wt;
00783         }
00784 
00792         virtual om_weight get_sumpart(om_termcount wdf,
00793                                       om_doclength len) const = 0;
00794 
00800         virtual om_weight get_maxpart() const = 0;
00801 
00810         virtual om_weight get_sumextra(om_doclength len) const = 0;
00811 
00815         virtual om_weight get_maxextra() const = 0;
00816 
00818         virtual bool get_sumpart_needs_doclength() const { return true; }
00819 };
00820 
00822 class BoolWeight : public OmWeight {
00823     public:
00824         OmWeight * clone() const {
00825             return new BoolWeight;
00826         }
00827         BoolWeight() { }
00828         ~BoolWeight() { }
00829         om_weight get_sumpart(om_termcount /*wdf*/, om_doclength /*len*/) const { return 0; }
00830         om_weight get_maxpart() const { return 0; }
00831 
00832         om_weight get_sumextra(om_doclength /*len*/) const { return 0; }
00833         om_weight get_maxextra() const { return 0; }
00834 
00835         bool get_sumpart_needs_doclength() const { return false; }      
00836 };
00837 
00839 //
00840 // BM25 weighting options : The BM25 formula is \f[
00841 //      \frac{C.s_{q}}{1+L_{d}}+\sum_{t}\frac{(A+1)q_{t}}{A+q_{t}}.\frac{(B+1)f_{t,d}}{B((1-D)+DL_{d})+f_{t,d}}.w_{t}
00842 // \f] where
00843 //   - \f$w_{t}\f$ is the termweight of term t
00844 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00845 //   - \f$q_{t}\f$ is the within query frequency of term t
00846 //   - \f$L_{d}\f$ is the normalised length of document d
00847 //   - \f$s_{q}\f$ is the size of the query
00848 //   - \f$A\f$, \f$B\f$, \f$C\f$ and \f$D\f$ are user specified parameters
00849 class BM25Weight : public OmWeight {
00850     private:
00851         mutable om_weight termweight;
00852         mutable om_doclength lenpart;
00853         mutable double BD;
00854 
00855         double A, B, C, D;
00856         om_doclength min_normlen;
00857 
00858         mutable bool weight_calculated;
00859 
00860         void calc_termweight() const;
00861 
00862     public:
00881         BM25Weight(double A_, double B_, double C_, double D_,
00882                    double min_normlen_)
00883                 : A(A_), B(B_), C(C_), D(D_), min_normlen(min_normlen_),
00884                   weight_calculated(false)
00885         {
00886             if (A < 0) throw OmInvalidArgumentError("Parameter A in BM25 weighting formula must be >= 0");
00887             if (B < 0) throw OmInvalidArgumentError("Parameter B in BM25 weighting formula must be >= 0");
00888             if (C < 0) throw OmInvalidArgumentError("Parameter C in BM25 weighting formula must be >= 0");
00889             if (D < 0 || D > 1) throw OmInvalidArgumentError("Parameter D in BM25 weighting formula must be >= 0 and <= 1");
00890         }
00891         BM25Weight() : A(1), B(1), C(0), D(0.5), min_normlen(0.5),
00892                        weight_calculated(false) { }
00893 
00894         OmWeight * clone() const {
00895             return new BM25Weight(A, B, C, D, min_normlen);
00896         }
00897         ~BM25Weight() { }
00898         om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
00899         om_weight get_maxpart() const;
00900 
00901         om_weight get_sumextra(om_doclength len) const;
00902         om_weight get_maxextra() const;
00903 
00904         bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
00905 };
00906 
00908 //
00909 // The Traditional weighting scheme formula is \f[
00910 //      \sum_{t}\frac{f_{t,d}}{k.L_{d}+f_{t,d}}.w_{t}
00911 // \f] where
00912 //   - \f$w_{t}\f$ is the termweight of term t
00913 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00914 //   - \f$L_{d}\f$ is the normalised length of document d
00915 //   - \f$k\f$ is a user specifiable parameter
00916 //
00917 // TradWeight is equivalent to BM25Weight(1, 1, 0, k, 0)
00918 class TradWeight : public OmWeight {
00919     private:
00920         mutable om_weight termweight;
00921         mutable om_doclength lenpart;
00922 
00923         double param_k;
00924 
00925         mutable bool weight_calculated;
00926 
00927         void calc_termweight() const;
00928 
00930         //
00931         // @param k  parameter governing the importance of within
00932         //           document frequency and document length - any positive
00933         //           number, 0 being wdf and doc length not used.  Default
00934         //           is 1.
00935         TradWeight(double k = 1) : param_k(k), weight_calculated(false) {
00936             if (param_k < 0) throw OmInvalidArgumentError("Parameter k in traditional weighting formula must be >= 0");
00937         }
00938     public:
00939         OmWeight * clone() const {
00940             return new TradWeight(param_k);
00941         }
00942         ~TradWeight() { }
00943         om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
00944         om_weight get_maxpart() const;
00945 
00946         om_weight get_sumextra(om_doclength len) const;
00947         om_weight get_maxextra() const;
00948 
00949         bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
00950 };
00951 
00952 #endif /* OM_HGUARD_OMENQUIRE_H */

Documentation for Xapian (version 0.6.1).
Generated on 7 Dec 2002 by Doxygen 1.2.15.