include/xapian/weight.h

Go to the documentation of this file.
00001 
00004 /* Copyright (C) 2007,2008,2009 Olly Betts
00005  * Copyright (C) 2009 Lemur Consulting Ltd
00006  *
00007  * This program is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU General Public License as
00009  * published by the Free Software Foundation; either version 2 of the
00010  * License, or (at your option) any later version.
00011  *
00012  * This program is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
00020  */
00021 
00022 #ifndef XAPIAN_INCLUDED_WEIGHT_H
00023 #define XAPIAN_INCLUDED_WEIGHT_H
00024 
00025 #include <string>
00026 
00027 #include <xapian/types.h>
00028 #include <xapian/visibility.h>
00029 
00030 namespace Xapian {
00031 
00033 class XAPIAN_VISIBILITY_DEFAULT Weight {
00034   protected:
00036     typedef enum {
00037         COLLECTION_SIZE = 1,
00038         RSET_SIZE = 2,
00039         AVERAGE_LENGTH = 4,
00040         TERMFREQ = 8,
00041         RELTERMFREQ = 16,
00042         QUERY_LENGTH = 32,
00043         WQF = 64,
00044         WDF = 128,
00045         DOC_LENGTH = 256,
00046         DOC_LENGTH_MIN = 512,
00047         DOC_LENGTH_MAX = 1024,
00048         WDF_MAX = 2048
00049     } stat_flags;
00050 
00060     void need_stat(stat_flags flag) {
00061         stats_needed = stat_flags(stats_needed | flag);
00062     }
00063 
00068     virtual void init(double factor) = 0;
00069 
00070   private:
00072     void operator=(const Weight &);
00073 
00083     virtual Weight * clone() const = 0;
00084 
00086     stat_flags stats_needed;
00087 
00089     Xapian::doccount collection_size_;
00090 
00092     Xapian::doccount rset_size_;
00093 
00095     Xapian::doclength average_length_;
00096 
00098     Xapian::doccount termfreq_;
00099 
00101     Xapian::doccount reltermfreq_;
00102 
00104     Xapian::termcount query_length_;
00105 
00107     Xapian::termcount wqf_;
00108 
00110     Xapian::termcount doclength_lower_bound_;
00111 
00113     Xapian::termcount doclength_upper_bound_;
00114 
00116     Xapian::termcount wdf_upper_bound_;
00117 
00118   public:
00119     class Internal;
00120 
00122     virtual ~Weight();
00123 
00138     virtual std::string name() const = 0;
00139 
00146     virtual std::string serialise() const = 0;
00147 
00157     virtual Weight * unserialise(const std::string & s) const = 0;
00158 
00167     virtual Xapian::weight get_sumpart(Xapian::termcount wdf,
00168                                        Xapian::termcount doclen) const = 0;
00169 
00175     virtual Xapian::weight get_maxpart() const = 0;
00176 
00184     virtual Xapian::weight get_sumextra(Xapian::termcount doclen) const = 0;
00185 
00192     virtual Xapian::weight get_maxextra() const = 0;
00193 
00201     Weight * clone_() const { return clone(); }
00202 
00212     void init_(const Internal & stats, Xapian::termcount query_len_,
00213                const std::string & term, Xapian::termcount wqf_,
00214                double factor);
00215 
00225     void init_(const Internal & stats, Xapian::termcount query_len_,
00226                double factor, Xapian::doccount termfreq,
00227                Xapian::doccount reltermfreq);
00228 
00235     void init_(const Internal & stats, Xapian::termcount query_len_);
00236 
00243     bool get_sumpart_needs_doclength_() const {
00244         return stats_needed & DOC_LENGTH;
00245     }
00246 
00252     bool get_sumpart_needs_wdf_() const {
00253         return stats_needed & WDF;
00254     }
00255 
00256   protected:
00258     Weight(const Weight &);
00259 
00261     Weight() : stats_needed() { }
00262 
00264     Xapian::doccount get_collection_size() const { return collection_size_; }
00265 
00267     Xapian::doccount get_rset_size() const { return rset_size_; }
00268 
00270     Xapian::doclength get_average_length() const { return average_length_; }
00271 
00273     Xapian::doccount get_termfreq() const { return termfreq_; }
00274 
00276     Xapian::doccount get_reltermfreq() const { return reltermfreq_; }
00277 
00279     Xapian::termcount get_query_length() const { return query_length_; }
00280 
00282     Xapian::termcount get_wqf() const { return wqf_; }
00283 
00288     Xapian::termcount get_doclength_upper_bound() const {
00289         return doclength_upper_bound_;
00290     }
00291 
00296     Xapian::termcount get_doclength_lower_bound() const {
00297         return doclength_lower_bound_;
00298     }
00299 
00304     Xapian::termcount get_wdf_upper_bound() const {
00305         return wdf_upper_bound_;
00306     }
00307 };
00308 
00313 class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
00314     BoolWeight * clone() const;
00315 
00316     void init(double factor);
00317 
00318   public:
00320     BoolWeight() { }
00321 
00322     std::string name() const;
00323 
00324     std::string serialise() const;
00325     BoolWeight * unserialise(const std::string & s) const;
00326 
00327     Xapian::weight get_sumpart(Xapian::termcount wdf,
00328                                Xapian::termcount doclen) const;
00329     Xapian::weight get_maxpart() const;
00330 
00331     Xapian::weight get_sumextra(Xapian::termcount doclen) const;
00332     Xapian::weight get_maxextra() const;
00333 };
00334 
00336 class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
00338     mutable Xapian::doclength len_factor;
00339 
00341     mutable Xapian::weight termweight;
00342 
00344     double param_k1, param_k2, param_k3, param_b;
00345 
00347     Xapian::doclength param_min_normlen;
00348 
00349     BM25Weight * clone() const;
00350 
00351     void init(double factor);
00352 
00353   public:
00381     BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
00382         : param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
00383           param_min_normlen(min_normlen)
00384     {
00385         if (param_k1 < 0) param_k1 = 0;
00386         if (param_k2 < 0) param_k2 = 0;
00387         if (param_k3 < 0) param_k3 = 0;
00388         if (param_b < 0) {
00389             param_b = 0;
00390         } else if (param_b > 1) {
00391             param_b = 1;
00392         }
00393         need_stat(COLLECTION_SIZE);
00394         need_stat(RSET_SIZE);
00395         need_stat(TERMFREQ);
00396         need_stat(RELTERMFREQ);
00397         need_stat(WDF);
00398         need_stat(WDF_MAX);
00399         need_stat(WDF);
00400         if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
00401             need_stat(DOC_LENGTH_MIN);
00402             need_stat(AVERAGE_LENGTH);
00403         }
00404         if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
00405         if (param_k2 != 0) need_stat(QUERY_LENGTH);
00406         if (param_k3 != 0) need_stat(WQF);
00407     }
00408 
00409     BM25Weight()
00410         : param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
00411           param_min_normlen(0.5)
00412     {
00413         need_stat(COLLECTION_SIZE);
00414         need_stat(RSET_SIZE);
00415         need_stat(TERMFREQ);
00416         need_stat(RELTERMFREQ);
00417         need_stat(WDF);
00418         need_stat(WDF_MAX);
00419         need_stat(WDF);
00420         need_stat(DOC_LENGTH_MIN);
00421         need_stat(AVERAGE_LENGTH);
00422         need_stat(DOC_LENGTH);
00423         need_stat(WQF);
00424     }
00425 
00426     std::string name() const;
00427 
00428     std::string serialise() const;
00429     BM25Weight * unserialise(const std::string & s) const;
00430 
00431     Xapian::weight get_sumpart(Xapian::termcount wdf,
00432                                Xapian::termcount doclen) const;
00433     Xapian::weight get_maxpart() const;
00434 
00435     Xapian::weight get_sumextra(Xapian::termcount doclen) const;
00436     Xapian::weight get_maxextra() const;
00437 };
00438 
00448 class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
00450     mutable Xapian::doclength len_factor;
00451 
00453     mutable Xapian::weight termweight;
00454 
00456     double param_k;
00457 
00458     TradWeight * clone() const;
00459 
00460     void init(double factor);
00461 
00462   public:
00470     explicit TradWeight(double k = 1.0) : param_k(k) {
00471         if (param_k < 0) param_k = 0;
00472         if (param_k != 0.0) {
00473             need_stat(AVERAGE_LENGTH);
00474             need_stat(DOC_LENGTH);
00475         }
00476         need_stat(COLLECTION_SIZE);
00477         need_stat(RSET_SIZE);
00478         need_stat(TERMFREQ);
00479         need_stat(RELTERMFREQ);
00480         need_stat(DOC_LENGTH_MIN);
00481         need_stat(WDF);
00482         need_stat(WDF_MAX);
00483         need_stat(WDF);
00484     }
00485 
00486     std::string name() const;
00487 
00488     std::string serialise() const;
00489     TradWeight * unserialise(const std::string & s) const;
00490 
00491     Xapian::weight get_sumpart(Xapian::termcount wdf,
00492                                Xapian::termcount doclen) const;
00493     Xapian::weight get_maxpart() const;
00494 
00495     Xapian::weight get_sumextra(Xapian::termcount doclen) const;
00496     Xapian::weight get_maxextra() const;
00497 };
00498 
00499 }
00500 
00501 #endif // XAPIAN_INCLUDED_WEIGHT_H

Documentation for Xapian (version 1.1.1).
Generated on 10 Jun 2009 by Doxygen 1.5.2.