00001
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #ifndef OM_HGUARD_OMENQUIRE_H
00027 #define OM_HGUARD_OMENQUIRE_H
00028
00029 #include "om/omtypes.h"
00030 #include "om/omdocument.h"
00031 #include "om/omdatabase.h"
00032 #include "om/omerror.h"
00033 #include "om/omsettings.h"
00034 #include <string>
00035
00036 class OmQuery;
00037 class OmErrorHandler;
00038 class OmWeight;
00039
00043 class OmMSetIterator {
00044 public:
00045 friend class OmMSet;
00046
00047 class Internal;
00049 Internal *internal;
00050
00051 friend bool operator==(const OmMSetIterator &a,
00052 const OmMSetIterator &b);
00053
00054 private:
00055 OmMSetIterator(Internal *internal_);
00056
00057 public:
00061 OmMSetIterator();
00062
00063 ~OmMSetIterator();
00064
00066 OmMSetIterator(const OmMSetIterator &other);
00067
00069 void operator=(const OmMSetIterator &other);
00070
00072 OmMSetIterator & operator++();
00073
00074 void operator++(int);
00075
00077 om_docid operator *() const;
00078
00097 OmDocument get_document() const;
00098
00105 om_doccount get_rank() const;
00106
00108 om_weight get_weight() const;
00109
00114 om_percent get_percent() const;
00115
00119 std::string get_description() const;
00120
00122
00123 typedef std::input_iterator_tag iterator_category;
00124 typedef om_docid value_type;
00125 typedef om_doccount_diff difference_type;
00126 typedef om_docid * pointer;
00127 typedef om_docid & reference;
00129 };
00130
00131 inline bool operator!=(const OmMSetIterator &a,
00132 const OmMSetIterator &b)
00133 {
00134 return !(a == b);
00135 }
00136
00140 class OmMSet {
00141 public:
00142 class Internal;
00144 Internal *internal;
00145
00146 public:
00147
00149
00150
00152 OmMSet();
00153
00155 ~OmMSet();
00156
00158 OmMSet(const OmMSet & other);
00159
00161 void operator=(const OmMSet &other);
00162
00178 void fetch(const OmMSetIterator &begin,
00179 const OmMSetIterator &end) const;
00180
00183 void fetch(const OmMSetIterator &item) const;
00184
00187 void fetch() const;
00188
00193 om_percent convert_to_percent(om_weight wt) const;
00194
00196 om_percent convert_to_percent(const OmMSetIterator &it) const;
00197
00205 om_doccount get_termfreq(const om_termname &tname) const;
00206
00214 om_weight get_termweight(const om_termname &tname) const;
00215
00222 om_doccount get_firstitem() const;
00223
00230 om_doccount get_matches_lower_bound() const;
00231
00241 om_doccount get_matches_estimated() const;
00242
00249 om_doccount get_matches_upper_bound() const;
00250
00256 om_weight get_max_possible() const;
00257
00271 om_weight get_max_attained() const;
00272
00273 om_doccount size() const;
00274
00275 om_doccount max_size() const;
00276
00277 bool empty() const;
00278
00279 void swap(OmMSet & other);
00280
00281 OmMSetIterator begin() const;
00282
00283 OmMSetIterator end() const;
00284
00285 OmMSetIterator back() const;
00286
00296 OmMSetIterator operator[](om_doccount i) const;
00297
00299
00300 typedef std::input_iterator_tag iterator_category;
00301 typedef OmMSetIterator value_type;
00302 typedef OmMSetIterator iterator;
00303 typedef OmMSetIterator const_iterator;
00304 typedef OmMSetIterator & reference;
00305 typedef OmMSetIterator & const_reference;
00306 typedef OmMSetIterator * pointer;
00307 typedef om_doccount_diff difference_type;
00308 typedef om_doccount size_type;
00310
00314 std::string get_description() const;
00315 };
00316
00318 class OmESetIterator {
00319 public:
00320 friend class OmESet;
00321 class Internal;
00323 Internal *internal;
00324
00325 friend bool operator==(const OmESetIterator &a,
00326 const OmESetIterator &b);
00327
00328 private:
00329
00330 OmESetIterator(Internal *internal_);
00331
00332 public:
00336 OmESetIterator();
00337
00339 ~OmESetIterator();
00340
00342 OmESetIterator(const OmESetIterator &other);
00343
00345 void operator=(const OmESetIterator &other);
00346
00347 OmESetIterator & operator++();
00348
00349 void operator++(int);
00350
00352 const om_termname & operator *() const;
00353
00355 om_weight get_weight() const;
00356
00360 std::string get_description() const;
00361
00363
00364 typedef std::input_iterator_tag iterator_category;
00365 typedef om_termname value_type;
00366 typedef om_termcount_diff difference_type;
00367 typedef om_termname * pointer;
00368 typedef om_termname & reference;
00370 };
00371
00372 inline bool
00373 operator!=(const OmESetIterator &a, const OmESetIterator &b)
00374 {
00375 return !(a == b);
00376 }
00377
00382 class OmESet {
00383 public:
00384 class Internal;
00386 Internal *internal;
00387
00389 OmESet();
00390
00392 ~OmESet();
00393
00395 OmESet(const OmESet & other);
00396
00398 void operator=(const OmESet &other);
00399
00404 om_termcount get_ebound() const;
00405
00407 om_termcount size() const;
00408
00410 bool empty() const;
00411
00413 OmESetIterator begin() const;
00414
00416 OmESetIterator end() const;
00417
00422 std::string get_description() const;
00423 };
00424
00429 class OmRSet {
00430 public:
00432 class Internal;
00434 Internal *internal;
00435
00437 OmRSet(const OmRSet &rset);
00438
00440 void operator=(const OmRSet &rset);
00441
00443 OmRSet();
00444
00446 ~OmRSet();
00447
00449 om_doccount size() const;
00450
00452 bool empty() const;
00453
00455 void add_document(om_docid did);
00456
00458 void add_document(const OmMSetIterator & i) { add_document(*i); }
00459
00461 void remove_document(om_docid did);
00462
00464 void remove_document(const OmMSetIterator & i) { remove_document(*i); }
00465
00467 bool contains(om_docid did) const;
00468
00470 bool contains(const OmMSetIterator & i) { return contains(*i); }
00471
00476 std::string get_description() const;
00477 };
00478
00481 class OmMatchDecider {
00482 public:
00485 virtual int operator()(const OmDocument &doc) const = 0;
00486
00488 virtual ~OmMatchDecider() {}
00489 };
00490
00493 class OmExpandDecider {
00494 public:
00497 virtual int operator()(const om_termname & tname) const = 0;
00498
00500 virtual ~OmExpandDecider() {}
00501 };
00502
00516 class OmEnquire {
00517 private:
00519 OmEnquire(const OmEnquire &);
00520
00522 void operator=(const OmEnquire &);
00523
00524 public:
00525 class Internal;
00527 Internal *internal;
00528
00544 OmEnquire(const OmDatabase &databases,
00545 OmErrorHandler * errorhandler_ = 0);
00546
00555 ~OmEnquire();
00556
00564 void set_query(const OmQuery & query_);
00565
00572 const OmQuery & get_query();
00573
00580 void set_weighting_scheme(const OmWeight &weight_);
00581
00603 OmMSet get_mset(om_doccount first,
00604 om_doccount maxitems,
00605 const OmRSet * omrset = 0,
00606 const OmSettings * moptions = 0,
00607 const OmMatchDecider * mdecider = 0) const;
00608
00630 OmESet get_eset(om_termcount maxitems,
00631 const OmRSet & omrset,
00632 bool exclude_query_terms = true,
00633 bool use_exact_termfreq = false,
00634 double k = 1.0,
00635 const OmExpandDecider * edecider = 0) const;
00636
00651 inline OmESet get_eset(om_termcount maxitems, const OmRSet & omrset,
00652 const OmExpandDecider * edecider) const {
00653 return get_eset(maxitems, omrset, true, false, 1.0, edecider);
00654 }
00655
00656
00657
00658 inline OmESet get_eset(om_termcount, const OmRSet &, OmSettings *,
00659 OmExpandDecider *dummy = 0) const {
00660 (void)dummy;
00661 throw "You need to update this call to OmEnquire::get_eset() "
00662 "- it no longer takes an OmSettings * parameter";
00663 }
00664
00694 OmTermIterator get_matching_terms_begin(om_docid did) const;
00695
00697 OmTermIterator get_matching_terms_end(om_docid did) const;
00698
00722 OmTermIterator get_matching_terms_begin(const OmMSetIterator &it) const;
00723
00725 OmTermIterator get_matching_terms_end(const OmMSetIterator &it) const;
00726
00729 void register_match_decider(const std::string &name,
00730 const OmMatchDecider *mdecider = NULL);
00731
00735 std::string get_description() const;
00736 };
00737
00739 class OmWeight {
00740 friend class OmEnquire;
00741 public:
00742 class Internal;
00743 private:
00744 OmWeight(const OmWeight &);
00745 void operator=(OmWeight &);
00746
00748
00749
00750
00751
00752
00753 virtual OmWeight * clone() const = 0;
00754
00755 protected:
00756 const Internal * internal;
00757 om_doclength querysize;
00758 om_termcount wqf;
00759 om_termname tname;
00760
00761 public:
00762 OmWeight() { }
00763 virtual ~OmWeight() { }
00764
00775 OmWeight * create(const Internal * internal_, om_doclength querysize_,
00776 om_termcount wqf_, om_termname tname_) {
00777 OmWeight * wt = clone();
00778 wt->internal = internal_;
00779 wt->querysize = querysize_;
00780 wt->wqf = wqf_;
00781 wt->tname = tname_;
00782 return wt;
00783 }
00784
00792 virtual om_weight get_sumpart(om_termcount wdf,
00793 om_doclength len) const = 0;
00794
00800 virtual om_weight get_maxpart() const = 0;
00801
00810 virtual om_weight get_sumextra(om_doclength len) const = 0;
00811
00815 virtual om_weight get_maxextra() const = 0;
00816
00818 virtual bool get_sumpart_needs_doclength() const { return true; }
00819 };
00820
00822 class BoolWeight : public OmWeight {
00823 public:
00824 OmWeight * clone() const {
00825 return new BoolWeight;
00826 }
00827 BoolWeight() { }
00828 ~BoolWeight() { }
00829 om_weight get_sumpart(om_termcount , om_doclength ) const { return 0; }
00830 om_weight get_maxpart() const { return 0; }
00831
00832 om_weight get_sumextra(om_doclength ) const { return 0; }
00833 om_weight get_maxextra() const { return 0; }
00834
00835 bool get_sumpart_needs_doclength() const { return false; }
00836 };
00837
00839
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849 class BM25Weight : public OmWeight {
00850 private:
00851 mutable om_weight termweight;
00852 mutable om_doclength lenpart;
00853 mutable double BD;
00854
00855 double A, B, C, D;
00856 om_doclength min_normlen;
00857
00858 mutable bool weight_calculated;
00859
00860 void calc_termweight() const;
00861
00862 public:
00881 BM25Weight(double A_, double B_, double C_, double D_,
00882 double min_normlen_)
00883 : A(A_), B(B_), C(C_), D(D_), min_normlen(min_normlen_),
00884 weight_calculated(false)
00885 {
00886 if (A < 0) throw OmInvalidArgumentError("Parameter A in BM25 weighting formula must be >= 0");
00887 if (B < 0) throw OmInvalidArgumentError("Parameter B in BM25 weighting formula must be >= 0");
00888 if (C < 0) throw OmInvalidArgumentError("Parameter C in BM25 weighting formula must be >= 0");
00889 if (D < 0 || D > 1) throw OmInvalidArgumentError("Parameter D in BM25 weighting formula must be >= 0 and <= 1");
00890 }
00891 BM25Weight() : A(1), B(1), C(0), D(0.5), min_normlen(0.5),
00892 weight_calculated(false) { }
00893
00894 OmWeight * clone() const {
00895 return new BM25Weight(A, B, C, D, min_normlen);
00896 }
00897 ~BM25Weight() { }
00898 om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
00899 om_weight get_maxpart() const;
00900
00901 om_weight get_sumextra(om_doclength len) const;
00902 om_weight get_maxextra() const;
00903
00904 bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
00905 };
00906
00908
00909
00910
00911
00912
00913
00914
00915
00916
00917
00918 class TradWeight : public OmWeight {
00919 private:
00920 mutable om_weight termweight;
00921 mutable om_doclength lenpart;
00922
00923 double param_k;
00924
00925 mutable bool weight_calculated;
00926
00927 void calc_termweight() const;
00928
00930
00931
00932
00933
00934
00935 TradWeight(double k = 1) : param_k(k), weight_calculated(false) {
00936 if (param_k < 0) throw OmInvalidArgumentError("Parameter k in traditional weighting formula must be >= 0");
00937 }
00938 public:
00939 OmWeight * clone() const {
00940 return new TradWeight(param_k);
00941 }
00942 ~TradWeight() { }
00943 om_weight get_sumpart(om_termcount wdf, om_doclength len) const;
00944 om_weight get_maxpart() const;
00945
00946 om_weight get_sumextra(om_doclength len) const;
00947 om_weight get_maxextra() const;
00948
00949 bool get_sumpart_needs_doclength() const { return (lenpart != 0); }
00950 };
00951
00952 #endif