Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

include/om/omenquire.h

Go to the documentation of this file.
00001 
00004 /* ----START-LICENCE----
00005  * Copyright 1999,2000,2001 BrightStation PLC
00006  * Copyright 2001,2002 Ananova Ltd
00007  * Copyright 2002 Olly Betts
00008  *
00009  * This program is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU General Public License as
00011  * published by the Free Software Foundation; either version 2 of the
00012  * License, or (at your option) any later version.
00013  *
00014  * This program is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU General Public License
00020  * along with this program; if not, write to the Free Software
00021  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
00022  * USA
00023  * -----END-LICENCE-----
00024  */
00025 
00026 #ifndef OM_HGUARD_OMENQUIRE_H
00027 #define OM_HGUARD_OMENQUIRE_H
00028 
00029 #include "om/omtypes.h"
00030 #include "om/omdocument.h"
00031 #include "om/omdatabase.h"
00032 #include "om/omerror.h"
00033 #include <string>
00034 
00035 class OmQuery;
00036 class OmErrorHandler;
00037 class OmWeight;
00038 
// Iterator over the items of an OmMSet: OmMSet::begin(), end(), back() and
// operator[] all return this type, and dereferencing yields an om_docid.
// All state sits behind the opaque Internal pointer; every member function is
// defined out of line, so exact semantics are set by the implementation file.
00042 class OmMSetIterator {
00043     public:
00044         friend class OmMSet; // gives OmMSet access to the private Internal* constructor
00045         // Implementation class; only forward-declared in this header.
00046         class Internal;
00048         Internal *internal; // NOTE(review): public raw pointer; ownership is not visible from this header
00049         // Equality is a friend free function; operator!= after the class negates it.
00050         friend bool operator==(const OmMSetIterator &a,
00051                                const OmMSetIterator &b);
00052 
00053     private:
00054         OmMSetIterator(Internal *internal_); // private: reachable only from members and friends
00055 
00056     public:
00060         OmMSetIterator();
00061 
00062         ~OmMSetIterator();
00063         // Copying is supported; note that assignment returns void, so it cannot be chained.
00065         OmMSetIterator(const OmMSetIterator &other);
00066 
00068         void operator=(const OmMSetIterator &other);
00069         // Pre-increment returns an iterator reference.
00071         OmMSetIterator & operator++();
00072         // Post-increment returns void: the old position is not handed back, so `*it++` will not compile.
00073         void operator++(int);
00074         // Dereference: returns the document id by value (not by reference).
00076         om_docid operator *() const;
00077         // Accessors for the current item.  NOTE(review): presumably the document, rank, weight and percentage score of the item pointed at - confirm against the implementation.
00096         OmDocument get_document() const;
00097 
00104         om_doccount get_rank() const;
00105 
00107         om_weight get_weight() const;
00108 
00113         om_percent get_percent() const;
00114         // Returns a std::string; presumably a human-readable description for debugging - TODO confirm.
00118         std::string get_description() const;
00119         // Standard iterator typedefs (input-iterator category).
00121         // NOTE(review): std::input_iterator_tag needs <iterator>, which this header does not include directly; pointer/reference name om_docid* / om_docid& although operator*() returns by value.
00122         typedef std::input_iterator_tag iterator_category;
00123         typedef om_docid value_type;
00124         typedef om_doccount_diff difference_type;
00125         typedef om_docid * pointer;
00126         typedef om_docid & reference;
00128 };
00129 
00130 inline bool operator!=(const OmMSetIterator &a, const OmMSetIterator &b)
00131 {
00132     // Inequality is simply the negation of the friend operator==.
00133     return (a == b) ? false : true;
00134 }
00135 
// Container-like handle whose elements are visited through OmMSetIterator;
// OmEnquire::get_mset() returns one by value.  All state sits behind the
// opaque Internal pointer and every member function is defined out of line.
00139 class OmMSet {
00140     public:
00141         class Internal; // implementation class; only forward-declared in this header
00143         Internal *internal; // NOTE(review): public raw pointer; ownership is not visible from this header
00144 
00145     public:
00146         // FIXME: public for now, private would be better
00148         OmMSet(OmMSet::Internal * internal_); // not explicit, so an Internal* converts implicitly to an OmMSet
00149 
00151         OmMSet();
00152 
00154         ~OmMSet();
00155         // Copying is supported; assignment returns void, so it cannot be chained.
00157         OmMSet(const OmMSet & other);
00158 
00160         void operator=(const OmMSet &other);
00161         // fetch() comes in three forms: an iterator range, a single item, or no argument.  NOTE(review): presumably these pre-load document data for the given items (no argument = all) - confirm.
00177         void fetch(const OmMSetIterator &begin,
00178                    const OmMSetIterator &end) const;
00179 
00182         void fetch(const OmMSetIterator &item) const;
00183 
00186         void fetch() const;
00187         // Convert either a raw weight or the item at an iterator to an om_percent.
00192         om_percent convert_to_percent(om_weight wt) const;
00193 
00195         om_percent convert_to_percent(const OmMSetIterator &it) const;
00196         // Per-term statistics looked up by term name.
00204         om_doccount get_termfreq(const om_termname &tname) const;
00205 
00213         om_weight get_termweight(const om_termname &tname) const;
00214         // NOTE(review): presumably the index of the first item held, i.e. the `first` passed to OmEnquire::get_mset() - confirm.
00221         om_doccount get_firstitem() const;
00222         // Lower bound, estimate and upper bound for the match count, as three separate accessors.
00229         om_doccount get_matches_lower_bound() const;
00230 
00240         om_doccount get_matches_estimated() const;
00241 
00248         om_doccount get_matches_upper_bound() const;
00249         // Weight limits: the maximum possible and the maximum actually attained.
00255         om_weight get_max_possible() const;
00256 
00270         om_weight get_max_attained() const;
00271         // Standard-container style interface from here on.
00272         om_doccount size() const;
00273 
00274         om_doccount max_size() const;
00275 
00276         bool empty() const;
00277 
00278         void swap(OmMSet & other);
00279 
00280         OmMSetIterator begin() const;
00281 
00282         OmMSetIterator end() const;
00283         // Unlike std containers, back() returns an iterator rather than a reference to the last element.
00284         OmMSetIterator back() const;
00285         // Indexing also returns an iterator, by value.
00295         OmMSetIterator operator[](om_doccount i) const;
00296         // Container typedefs.  NOTE(review): const_reference is not const-qualified and
00298         // iterator_category is unusual on a container; left untouched because they are public API.
00299         typedef std::input_iterator_tag iterator_category;
00300         typedef OmMSetIterator value_type; // FIXME: not assignable...
00301         typedef OmMSetIterator iterator;
00302         typedef OmMSetIterator const_iterator;
00303         typedef OmMSetIterator & reference; // Hmm
00304         typedef OmMSetIterator & const_reference;
00305         typedef OmMSetIterator * pointer; // Hmm
00306         typedef om_doccount_diff difference_type;
00307         typedef om_doccount size_type;
00309         // Returns a std::string; presumably a human-readable description for debugging - TODO confirm.
00313         std::string get_description() const;
00314 };
00315 
// Iterator over the items of an OmESet: OmESet::begin() and end() return this
// type, and dereferencing yields a const om_termname reference.  All state
// sits behind the opaque Internal pointer; members are defined out of line.
00317 class OmESetIterator {
00318     public:
00319         friend class OmESet; // gives OmESet access to the private Internal* constructor
00320         class Internal; // implementation class; only forward-declared in this header
00322         Internal *internal; // NOTE(review): public raw pointer; ownership is not visible from this header
00323         // Equality is a friend free function; operator!= after the class negates it.
00324         friend bool operator==(const OmESetIterator &a,
00325                                const OmESetIterator &b);
00326 
00327     private:
00328         // Private: reachable only from members and friends.
00329         OmESetIterator(Internal *internal_);
00330 
00331     public:
00335         OmESetIterator();
00336 
00338         ~OmESetIterator();
00339         // Copying is supported; assignment returns void, so it cannot be chained.
00341         OmESetIterator(const OmESetIterator &other);
00342 
00344         void operator=(const OmESetIterator &other);
00345         // Pre-increment returns an iterator reference.
00346         OmESetIterator & operator++();
00347         // Post-increment returns void: the old position is not handed back, so `*it++` will not compile.
00348         void operator++(int);
00349         // Dereference: returns the term name by const reference.
00351         const om_termname & operator *() const;
00352         // NOTE(review): presumably the weight attached to the current term - confirm.
00354         om_weight get_weight() const;
00355         // Returns a std::string; presumably a human-readable description for debugging - TODO confirm.
00359         std::string get_description() const;
00360         // Standard iterator typedefs (input-iterator category).
00362         // NOTE(review): std::input_iterator_tag needs <iterator>, which this header does not include directly; pointer/reference are non-const although operator*() returns a const reference.
00363         typedef std::input_iterator_tag iterator_category;
00364         typedef om_termname value_type;
00365         typedef om_termcount_diff difference_type;
00366         typedef om_termname * pointer;
00367         typedef om_termname & reference;
00369 };
00370 
00371 inline bool operator!=(const OmESetIterator &a, const OmESetIterator &b)
00372 {
00373     const bool same = (a == b);   // delegate to the friend operator==
00374     return !same;
00375 }
00376 
// Container-like handle whose elements are visited through OmESetIterator;
// OmEnquire::get_eset() returns one by value.  All state sits behind the
// opaque Internal pointer and every member function is defined out of line.
00381 class OmESet {
00382     public:
00383         class Internal; // implementation class; only forward-declared in this header
00385         Internal *internal; // NOTE(review): public raw pointer; ownership is not visible from this header
00386 
00388         OmESet();
00389 
00391         ~OmESet();
00392         // Copying is supported; assignment returns void, so it cannot be chained.
00394         OmESet(const OmESet & other);
00395 
00397         void operator=(const OmESet &other);
00398         // NOTE(review): returns an om_termcount; presumably a bound on the number of terms that could have been returned - confirm.
00403         om_termcount get_ebound() const;
00404         // Size / emptiness queries in the usual container style.
00406         om_termcount size() const;
00407 
00409         bool empty() const;
00410         // Iteration endpoints; both return iterators by value.
00412         OmESetIterator begin() const;
00413 
00415         OmESetIterator end() const;
00416         // Returns a std::string; presumably a human-readable description for debugging - TODO confirm.
00421         std::string get_description() const;
00422 };
00423 
00425 // Set of document ids, kept behind an opaque Internal object.  OmEnquire takes
00426 // an OmRSet in get_mset() (optional pointer) and get_eset() (reference).
00427 // Each om_docid operation has an inline overload taking an OmMSetIterator.
00428 class OmRSet {
00429     public:
00430         // Implementation class; only forward-declared in this header.
00431         class Internal;
00432         // NOTE(review): public raw pointer; ownership is not visible from this header.
00433         Internal *internal;
00434 
00435         // Copying is supported; assignment returns void, so it cannot be chained.
00436         OmRSet(const OmRSet &rset);
00437 
00439         void operator=(const OmRSet &rset);
00440 
00442         OmRSet();
00443 
00445         ~OmRSet();
00446 
00447         // Size / emptiness queries in the usual container style.
00448         om_doccount size() const;
00449 
00451         bool empty() const;
00452 
00453         // Add a document id to the set.
00454         void add_document(om_docid did);
00455 
00456         // Convenience overload: adds the document id the iterator dereferences to.
00457         void add_document(const OmMSetIterator & i) { add_document(*i); }
00458 
00459         // Remove a document id from the set.
00460         void remove_document(om_docid did);
00461 
00462         // Convenience overload: removes the document id the iterator dereferences to.
00463         void remove_document(const OmMSetIterator & i) { remove_document(*i); }
00464 
00465         // Membership test by document id.
00466         bool contains(om_docid did) const;
00467 
00468         // Convenience overload.  Const-qualified like the om_docid overload it forwards to, so it also works on a const OmRSet.
00469         bool contains(const OmMSetIterator & i) const { return contains(*i); }
00470 
00474         // Returns a std::string; presumably a human-readable description for debugging - TODO confirm.
00475         std::string get_description() const;
00476 };
00477 
// Abstract functor interface: takes an OmDocument and returns an int.
// Pointers to it are accepted by OmEnquire::get_mset() and
// OmEnquire::register_match_decider().
00480 class OmMatchDecider {
00481     public:
00484         virtual int operator()(const OmDocument &doc) const = 0; // NOTE(review): presumably non-zero means "accept this document" - confirm
00485         // Virtual so that deleting through a base pointer runs the derived destructor.
00487         virtual ~OmMatchDecider() {}
00488 };
00489 
// Abstract functor interface: takes a term name and returns an int.
// A pointer to it is accepted by OmEnquire::get_eset().
00492 class OmExpandDecider {
00493     public:
00496         virtual int operator()(const om_termname & tname) const = 0; // NOTE(review): presumably non-zero means "accept this term" - confirm
00497         // Virtual so that deleting through a base pointer runs the derived destructor.
00499         virtual ~OmExpandDecider() {}
00500 };
00501 
00512 // Search interface: constructed from an OmDatabase, configured through the
00513 // set_*() methods and queried through get_mset() / get_eset().  All state sits
00514 // behind the opaque Internal pointer; non-inline members are defined elsewhere.
00515 class OmEnquire {
00516     private:
00517         // Copying is disabled: both copy operations are private.
00518         OmEnquire(const OmEnquire &);
00519 
00521         void operator=(const OmEnquire &);
00522 
00523     public:
00524         class Internal;
00525         // NOTE(review): public raw pointer; ownership is not visible from this header.
00526         Internal *internal;
00527 
00541         // Bind to a database.  The error handler is optional and defaults to
00542         // a null pointer.
00543         OmEnquire(const OmDatabase &databases,
00544                   OmErrorHandler * errorhandler_ = 0);
00545 
00554         ~OmEnquire();
00555 
00562         // Set the query (presumably the one get_mset() evaluates).
00563         void set_query(const OmQuery & query_);
00564 
00569         // Return the current query by const reference.
00570         // NOTE(review): not const-qualified, so unusable on a const OmEnquire.
00571         const OmQuery & get_query();
00572 
00578         // Choose the weighting scheme.  OmWeight befriends OmEnquire "so OmEnquire can clone us".
00579         void set_weighting_scheme(const OmWeight &weight_);
00580 
00586         // Match options; see the out-of-line definitions for exact semantics.
00587         void set_collapse_key(om_valueno collapse_key);
00588 
00595         void set_sort_forward(bool sort_forward);
00596 
00613         // weight_cutoff is optional and defaults to 0.
00614         void set_cutoff(int percent_cutoff, om_weight weight_cutoff = 0);
00615 
00629         void set_sorting(om_valueno sort_key, int sort_bands);
00630 
00642         void set_bias(om_weight bias_weight, time_t bias_halflife);
00643 
00661         // Run the match and return an OmMSet by value.  `first` and
00662         // `maxitems` presumably select the window of results (TODO confirm);
00663         // the relevance set and match decider are optional (null by default).
00664         OmMSet get_mset(om_doccount first,
00665                         om_doccount maxitems,
00666                         const OmRSet * omrset = 0,
00667                         const OmMatchDecider * mdecider = 0) const;
00668 
00669         // Flag values for the `flags` argument of get_eset(); distinct bits (1 and 2).
00670         static const int include_query_terms = 1;
00671         static const int use_exact_termfreq = 2;
00693         // Build an OmESet from a relevance set.  flags defaults to 0, k to 1.0 and the decider to null.
00694         OmESet get_eset(om_termcount maxitems,
00695                         const OmRSet & omrset,
00696                         int flags = 0,
00697                         double k = 1.0,
00698                         const OmExpandDecider * edecider = 0) const;
00699 
00713         // Shorthand for supplying only a decider: forwards with flags = 0 and k = 1.0.
00714         inline OmESet get_eset(om_termcount maxitems, const OmRSet & omrset,
00715                                const OmExpandDecider * edecider) const {
00716             return get_eset(maxitems, omrset, 0, 1.0, edecider);
00717         }
00718 
00747         // Begin/end iterator pair over the matching terms for a document id.
00748         OmTermIterator get_matching_terms_begin(om_docid did) const;
00749 
00751         OmTermIterator get_matching_terms_end(om_docid did) const;
00752 
00775         // The same pair, addressed through an OmMSetIterator instead of a document id.
00776         OmTermIterator get_matching_terms_begin(const OmMSetIterator &it) const;
00777 
00779         OmTermIterator get_matching_terms_end(const OmMSetIterator &it) const;
00780 
00781         // Register a match decider under a name.  The default is spelled 0, like every
00782         // other defaulted pointer in this class, so the NULL macro is not required.
00783         void register_match_decider(const std::string &name,
00784                                     const OmMatchDecider *mdecider = 0);
00785 
00788         // Returns a std::string; presumably a human-readable description for debugging - TODO confirm.
00789         std::string get_description() const;
00790 };
00791 
00792 class SocketServer;
00793 
00795 class OmWeight {
00796     friend class OmEnquire; // So OmEnquire can clone us
00797     friend class SocketServer; // So SocketServer can clone us - FIXME
00798     public:
00799         class Internal;
00800     private:
00801         OmWeight(const OmWeight &);
00802         void operator=(OmWeight &);
00803 
00805         //
00806         // Each subclass should implement this as:
00807         // virtual OmFooWeight * clone() const {
00808         //     return new OmFooWeight(param1, param2);
00809         // }
00810         virtual OmWeight * clone() const = 0;
00811 
00812     protected:
00813         const Internal * internal; // OmWeight::Internal == StatsSource
00814         om_doclength querysize;
00815         om_termcount wqf;
00816         om_termname tname;
00817 
00818     public:
00819         OmWeight() { }
00820         virtual ~OmWeight() { }
00821 
00833         OmWeight * create(const Internal * internal_, om_doclength querysize_,
00834                           om_termcount wqf_, om_termname tname_) const {
00835             OmWeight * wt = clone();
00836             wt->internal = internal_;
00837             wt->querysize = querysize_;
00838             wt->wqf = wqf_;
00839             wt->tname = tname_;
00840             return wt;
00841         }
00842 
00844         //
00845         //  If the subclass is called FooWeight, this should return "Foo".
00846         virtual std::string name() const = 0;
00847 
00849         virtual std::string serialise() const = 0;
00850 
00852         virtual OmWeight * OmWeight::unserialise(const std::string &s) const = 0;
00853 
00861         virtual om_weight get_sumpart(om_termcount wdf,
00862                                       om_doclength len) const = 0;
00863 
00869         virtual om_weight get_maxpart() const = 0;
00870 
00879         virtual om_weight get_sumextra(om_doclength len) const = 0;
00880 
00884         virtual om_weight get_maxextra() const = 0;
00885 
00887         virtual bool get_sumpart_needs_doclength() const { return true; }
00888 };
00889 
00890 // Weighting scheme in which every weight component is 0.
00891 class BoolWeight : public OmWeight {
00892     public:
00893         BoolWeight() { }
00894         ~BoolWeight() { }
00895 
00896         // No parameters to carry over: cloning and unserialising both just make a new object.
00897         OmWeight * clone() const { return new BoolWeight(); }
00898         std::string name() const { return std::string("Bool"); }
00899         std::string serialise() const { return std::string(); }
00900         OmWeight * unserialise(const std::string &) const {
00901             return new BoolWeight();
00902         }
00903 
00904         om_weight get_sumpart(om_termcount, om_doclength) const { return 0; }
00905         om_weight get_maxpart() const { return 0; }
00906         om_weight get_sumextra(om_doclength) const { return 0; }
00907         om_weight get_maxextra() const { return 0; }
00908 
00909         bool get_sumpart_needs_doclength() const { return false; }
00910 };
00911 
00912 // BM25 weighting scheme.
00913 //
00914 // BM25 weighting options : The BM25 formula is \f[
00915 //      \frac{C.s_{q}}{1+L_{d}}+\sum_{t}\frac{(A+1)q_{t}}{A+q_{t}}.\frac{(B+1)f_{t,d}}{B((1-D)+DL_{d})+f_{t,d}}.w_{t}
00916 // \f] where
00917 //   - \f$w_{t}\f$ is the termweight of term t
00918 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00919 //   - \f$q_{t}\f$ is the within query frequency of term t
00920 //   - \f$L_{d}\f$ is the normalised length of document d
00921 //   - \f$s_{q}\f$ is the size of the query
00922 //   - \f$A\f$, \f$B\f$, \f$C\f$ and \f$D\f$ are user specified parameters
00923 class BM25Weight : public OmWeight {
00924     private:
00925         // Mutable working values, writable from the const calc_termweight().
00926         // NOTE(review): presumably caches filled in by calc_termweight() - confirm.
00927         mutable om_weight termweight;
00928         mutable om_doclength lenpart;
00929         mutable double BD;
00930 
00931         // User-specified parameters of the formula above, clamped by the constructor.
00932         double A, B, C, D;
00933         om_doclength min_normlen;
00934 
00935         // False on construction.  NOTE(review): presumably set by calc_termweight() - confirm.
00936         mutable bool weight_calculated;
00937 
00938         // Defined out of line.
00939         void calc_termweight() const;
00940 
00941     public:
00942         // Construct a BM25 weight from explicit parameters.
00943         //
00944         // @param A_, B_, C_  formula parameters; negative values are clamped to 0.
00945         // @param D_          formula parameter; clamped into the range [0, 1].
00946         // @param min_normlen_  stored as given, with no clamping.
00947         //                      NOTE(review): presumably a floor for the
00948         //                      normalised document length - confirm.
00949         // The cache members start at 0 so that they never hold indeterminate values.
00950         BM25Weight(double A_, double B_, double C_, double D_,
00951                    double min_normlen_)
00952                 : termweight(0), lenpart(0), BD(0),
00953                   A(A_), B(B_), C(C_), D(D_), min_normlen(min_normlen_),
00954                   weight_calculated(false)
00955         {
00956             if (A < 0) A = 0;
00957             if (B < 0) B = 0;
00958             if (C < 0) C = 0;
00959             if (D < 0) D = 0; else if (D > 1) D = 1;
00960         }
00961         // Defaults: A = 1, B = 1, C = 0, D = 0.5, min_normlen = 0.5.
00962         BM25Weight() : termweight(0), lenpart(0), BD(0),
00963                        A(1), B(1), C(0), D(0.5), min_normlen(0.5),
00964                        weight_calculated(false) { }
00965 
00966         // The clone carries the parameters only; its cached values start afresh.
00967         OmWeight * clone() const { return new BM25Weight(A, B, C, D, min_normlen); }
00968         ~BM25Weight() { }
00969         std::string name() const { return "BM25"; }
00970         std::string serialise() const;
00971         OmWeight * unserialise(const std::string & s) const;
00972         om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
00973         om_weight get_maxpart() const;
00974         om_weight get_sumextra(om_doclength len) const;
00975         om_weight get_maxextra() const;
00976         // Run the calculation first if it has not happened yet, then test lenpart.
00977         // NOTE(review): assumes calc_termweight() fills in lenpart and sets weight_calculated - confirm.
00978         bool get_sumpart_needs_doclength() const {
00979             if (!weight_calculated) calc_termweight();
00980             return (lenpart != 0);
00981         }
00982 };
00983 
00984 // Traditional probabilistic weighting scheme.
00985 //
00986 // The Traditional weighting scheme formula is \f[
00987 //      \sum_{t}\frac{f_{t,d}}{k.L_{d}+f_{t,d}}.w_{t}
00988 // \f] where
00989 //   - \f$w_{t}\f$ is the termweight of term t
00990 //   - \f$f_{t,d}\f$ is the within document frequency of term t in document d
00991 //   - \f$L_{d}\f$ is the normalised length of document d
00992 //   - \f$k\f$ is a user specifiable parameter
00993 //
00994 // TradWeight is equivalent to BM25Weight(1, 1, 0, k, 0)
00995 class TradWeight : public OmWeight {
00996     private:
00997         // Mutable working values, writable from the const calc_termweight() (presumably its caches - confirm).
00998         mutable om_weight termweight;
00999         mutable om_doclength lenpart;
01000         double param_k;
01001         // False on construction.  NOTE(review): presumably set by calc_termweight() - confirm.
01002         mutable bool weight_calculated;
01003         void calc_termweight() const;
01004 
01005     public:
01006         // Construct a TradWeight.  The cache members start at 0 so that they never hold indeterminate values.
01007         //
01008         // @param k  parameter governing the importance of within
01009         //           document frequency and document length - any positive
01010         //           number, 0 being wdf and doc length not used.  Default
01011         //           is 1.  Negative values are clamped to 0.
01012         TradWeight(double k = 1)
01013                 : termweight(0), lenpart(0), param_k(k), weight_calculated(false) {
01014             if (param_k < 0) param_k = 0;
01015         }
01016         OmWeight * clone() const { return new TradWeight(param_k); }
01017         ~TradWeight() { }
01018         std::string name() const { return "Trad"; }
01019         std::string serialise() const;
01020         OmWeight * unserialise(const std::string & s) const;
01021         om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
01022         om_weight get_maxpart() const;
01023         om_weight get_sumextra(om_doclength len) const;
01024         om_weight get_maxextra() const;
01025         // Run the calculation first if it has not happened yet, then test lenpart.
01026         // NOTE(review): assumes calc_termweight() fills in lenpart and sets weight_calculated - confirm.
01027         bool get_sumpart_needs_doclength() const {
01028             if (!weight_calculated) calc_termweight();
01029             return (lenpart != 0);
01030         }
01031 };
01032 
01033 #endif /* OM_HGUARD_OMENQUIRE_H */

Documentation for Xapian (version 0.6.3).
Generated on 14 Dec 2002 by Doxygen 1.2.15.