Index: Makefile.am
===================================================================
--- Makefile.am	(revision 7042)
+++ Makefile.am	(working copy)
@@ -61,7 +61,7 @@
 
 libxapian_la_LIBADD = common/libcommon.la backends/libbackend.la \
  matcher/libmatcher.la languages/liblanguages.la api/libapi.la $(LIBNET_LA) \
- queryparser/libqueryparser.la
+ queryparser/libqueryparser.la -lz
 
 dist_man_MANS = xapian-config.1
 
Index: backends/flint/flint_table.cc
===================================================================
--- backends/flint/flint_table.cc	(revision 7042)
+++ backends/flint/flint_table.cc	(working copy)
@@ -69,6 +69,7 @@
 #include "utils.h"
 
 #include <algorithm>  // for std::min()
+#include <iostream>
 #include <string>
 
 #ifdef __WIN32__
@@ -79,6 +80,13 @@
 # endif
 #endif
 
+#ifdef COMPRESSED_TAGS
+#include <zlib.h>
+// Only try to compress tags longer than this many bytes.
+const size_t COMPRESS_MIN = 4;
+#define DONT_COMPRESS -1
+#endif /* COMPRESSED_TAGS */
+
 // Only useful for platforms like Windows which distinguish between text and
 // binary files.
 #ifndef __WIN32__
@@ -1149,6 +1157,62 @@
 
     form_key(key);
 
+#ifdef COMPRESSED_TAGS
+    bool compressed = false;
+    if (compress_strategy != DONT_COMPRESS && tag.size() > COMPRESS_MIN) {
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Bytef *)const_cast<char *>(tag.data());
+	stream.avail_in = (uInt)tag.size();
+
+	// If compressed size is >= tag.size(), we don't want to compress.
+	unsigned long blk_len = tag.size() - 1;
+	unsigned char * blk = new unsigned char[blk_len];
+	stream.next_out = blk;
+	stream.avail_out = (uInt)blk_len;
+
+	stream.zalloc = reinterpret_cast<alloc_func>(0);
+	stream.zfree = reinterpret_cast<free_func>(0);
+	stream.opaque = (voidpf)0;
+
+	//cout << "deflateInit about to be called\n" << endl;
+	// -15 means raw deflate with 32K LZ77 window (largest)
+	// memLevel 9 is the highest (8 is default)
+	err = deflateInit2(&stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, 9,
+			   compress_strategy);
+	//cout << "deflateInit returns " << Z_OK << endl;
+	if (err != Z_OK) abort();
+	if (!compress_dict.empty()) {
+	    err = deflateSetDictionary(&stream,
+		    reinterpret_cast<const Bytef *>(compress_dict.data()),
+		    compress_dict.size());
+	    //cout << "deflateSetDictionary returns " << Z_OK << endl;
+	    if (err != Z_OK) abort();
+	}
+
+	err = deflate(&stream, Z_FINISH);
+	if (err == Z_STREAM_END) {
+	    // If deflate succeeded, then the output was at least one byte
+	    // smaller than the input.
+	    //cout << "deflate compressed "<<tag.size()<<"["<<tag<<"] to ";
+	    tag.assign(reinterpret_cast<const char *>(blk), stream.total_out);
+	    //cout << tag.size()<<"["<<tag<<"]"<<endl;
+	    compressed = true;
+	    err = deflateEnd(&stream);
+	    if (err != Z_OK) {
+		cerr << "deflateEnd failed, err = " << err << endl;
+		abort();
+	    }
+	} else {
+	    // Deflate failed - presumably the data wasn't compressible.
+	    (void)deflateEnd(&stream);
+	}
+
+	delete [] blk;
+    }
+#endif /* COMPRESSED_TAGS */
+
     // sort of matching kt.append_chunk(), but setting the chunk
     const size_t cd = kt.key().length() + K1 + I2 + C2 + C2;  // offset to the tag data
     const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
@@ -1196,7 +1260,11 @@
 	size_t l = (i == m ? residue : (i == 1 ? first_L : L));
 	Assert(cd + l <= block_size);
 	Assert(string::size_type(o + l) <= tag.length());
+#ifndef COMPRESSED_TAGS
 	kt.set_tag(cd, tag.data() + o, l);
+#else /* COMPRESSED_TAGS */
+	kt.set_tag(cd, tag.data() + o, l, compressed);
+#endif /* COMPRESSED_TAGS */
 	kt.set_component_of(i);
 	
 	o += l;
@@ -1284,6 +1352,9 @@
     if (n > 1) tag->reserve((max_item_size - (1 + K1 + I2 + C2 + C2)) * n);
 
     item.append_chunk(tag);
+#ifdef COMPRESSED_TAGS
+    bool compressed = item.get_compressed();
+#endif /* COMPRESSED_TAGS */
 
     for (int i = 2; i <= n; i++) {
 	next(C_, 0);
@@ -1291,6 +1362,103 @@
     }
     // At this point the cursor is on the last item - calling next will move
     // it to the next key (FlintCursor::get_tag() relies on this).
+#ifdef COMPRESSED_TAGS
+    if (!compressed) return;
+
+    // FIXME: Perhaps we should we decompress each chunk as we read it so we
+    // don't need both the full compressed and uncompressed tags in memory
+    // at once.  But at least this makes for a cleaner patch while this is
+    // still somewhat experimental.
+    z_stream stream;
+    int err;
+
+    string utag;
+    // May not be enough for a compressed tag, but it's a reasonable guess.
+    utag.reserve(tag->size() + tag->size() / 2);
+
+    Bytef buf[8192];
+
+    //cout << "inflating" << endl;
+    stream.next_out = buf;
+    stream.avail_out = (uInt)sizeof(buf);
+
+    stream.zalloc = reinterpret_cast<alloc_func>(0);
+    stream.zfree = reinterpret_cast<free_func>(0);
+
+    Bytef header[6] = { 0x78, 0x20, 0, 0, 0, 0};
+    if (!compress_dict.empty()) {
+	stream.next_in = header;
+	stream.avail_in = 6;
+	uLong adler = adler32(0L, Z_NULL, 0);
+	stream.adler = adler;
+	adler = adler32(adler,
+			reinterpret_cast<const Bytef *>(compress_dict.data()),
+			compress_dict.size());
+	setint4(header, 2, adler);
+	// cout << "inflateInit" << endl;
+	err = inflateInit(&stream);
+	if (err != Z_OK) {
+	    cout << "inflateInit err = " << err << endl;
+	    abort();
+	}
+	err = inflate(&stream, Z_SYNC_FLUSH);
+	if (err != Z_NEED_DICT) {
+	    cout << "don't need dict?!" << endl;
+	    abort();
+	}
+	err = inflateSetDictionary(&stream,
+		reinterpret_cast<const Bytef *>(compress_dict.data()),
+		compress_dict.size());
+	if (err != Z_OK) {
+	    cout << "iSD failed with err = " << err << endl;
+	}
+    } else {
+	stream.next_in = Z_NULL;
+	stream.avail_in = 0;
+	// cout << "inflateInit2" << endl;
+	err = inflateInit2(&stream, -15);
+	if (err != Z_OK) {
+	    cout << "inflateInit2 err = " << err << endl;
+	    abort();
+	}
+    }
+
+    stream.next_in = (Bytef*)const_cast<char *>(tag->data());
+    stream.avail_in = (uInt)tag->size();
+
+    while (err != Z_STREAM_END) {
+	stream.next_out = buf;
+	stream.avail_out = (uInt)sizeof(buf);
+	err = inflate(&stream, Z_SYNC_FLUSH);
+	if (err == Z_BUF_ERROR && stream.avail_in == 0) {
+	    cout << "Z_BUF_ERROR - faking checksum of " << stream.adler << endl;
+	    Bytef header2[4];
+	    setint4(header2, 0, stream.adler);
+	    stream.next_in = header2;
+	    stream.avail_in = 4;
+	    err = inflate(&stream, Z_SYNC_FLUSH);
+	    if (err == Z_STREAM_END) break;
+	}
+
+	if (err != Z_OK && err != Z_STREAM_END) {
+	    cout << "inflate err = " << err << endl;
+	    abort();
+	}
+
+	utag.append(reinterpret_cast<const char *>(buf),
+		    stream.next_out - buf);
+    }
+    Assert(utag.size() == stream.total_out);
+    if (utag.size() != stream.total_out) {
+	cout << "utag.size() != stream.total_out (" << utag.size() << "!=" << stream.total_out << ")" << endl;
+	abort();
+    }
+
+    err = inflateEnd(&stream);
+    if (err != Z_OK) abort();
+
+    swap(*tag, utag);
+#endif /* COMPRESSED_TAGS */
 }
 
 void
@@ -1422,6 +1590,38 @@
 
     base_letter = ch;
 
+#ifdef COMPRESSED_TAGS
+    // If foo_compress exists then try to compress tags in foo_DB with zlib
+    // using the contents of foo_compress as a dictionary.
+    int fd = ::open((name + "compress").c_str(), O_RDONLY | O_BINARY);
+    compress_strategy = DONT_COMPRESS;
+    if (fd > 0) {
+	compress_strategy = Z_DEFAULT_STRATEGY;
+	compress_dict = sys_read_n_bytes(fd, 65536);
+	::close(fd);
+    }
+    fd = ::open((name + "compress_strategy").c_str(), O_RDONLY | O_BINARY);
+// Look at the first character of foo_compress_strategy:
+//	Z_DEFAULT_STRATEGY for normal data
+// F/f:	Z_FILTERED for data produced by a filter (or predictor)
+// H/h:	Z_HUFFMAN_ONLY to force Huffman encoding only (no string match)
+//// R/r:	Z_RLE to limit match distances to one (run-length encoding)
+    CompileTimeAssert(DONT_COMPRESS != Z_DEFAULT_STRATEGY);
+    CompileTimeAssert(DONT_COMPRESS != Z_FILTERED);
+    CompileTimeAssert(DONT_COMPRESS != Z_HUFFMAN_ONLY);
+//    CompileTimeAssert(DONT_COMPRESS != Z_RLE);
+    if (fd > 0) {
+	string s = sys_read_n_bytes(fd, 1);
+	::close(fd);
+	switch (s[0]) {
+	    case 'F': case 'f': compress_strategy = Z_FILTERED; break;
+	    case 'H': case 'h': compress_strategy = Z_HUFFMAN_ONLY; break;
+//	    case 'R': case 'r': compress_strategy = Z_RLE; break;
+	    default: compress_strategy = Z_DEFAULT_STRATEGY;
+	}
+    }
+#endif /* COMPRESSED_TAGS */
+
     /* ready to open the main file */
 
     return true;
Index: backends/flint/flint_table.h
===================================================================
--- backends/flint/flint_table.h	(revision 7042)
+++ backends/flint/flint_table.h	(working copy)
@@ -22,6 +22,9 @@
 #ifndef OM_HGUARD_FLINT_TABLE_H
 #define OM_HGUARD_FLINT_TABLE_H
 
+// Compress tags with zlib.
+#define COMPRESSED_TAGS
+
 #include <algorithm>
 #include <string>
 using std::string;
@@ -113,7 +116,12 @@
     Item_base_(T p_, int c) : p(p_ + getint2(p_, c)) { }
     Item_base_(T p_) : p(p_) { }
     T get_address() const { return p; }
+#ifdef COMPRESSED_TAGS
+    int size() const { return getint2(p, 0) & 0x7fff; } /* I in diagram above */
+    bool get_compressed() const { return *p & 0x80; }
+#else /* COMPRESSED_TAGS */
     int size() const { return getint2(p, 0); } /* I in diagram above */
+#endif /* COMPRESSED_TAGS */
     int component_of() const {
 	return getint2(p, getK(p, I2) + I2 - C2);
     }
@@ -199,9 +207,16 @@
 	set_component_of(1);
     }
     // FIXME passing cd here is icky
+#ifdef COMPRESSED_TAGS
+    void set_tag(int cd, const char *start, int len, bool compressed) {
+#else /* COMPRESSED_TAGS */
     void set_tag(int cd, const char *start, int len) {
+#endif /* COMPRESSED_TAGS */
 	memmove(p + cd, start, len);
 	set_size(cd + len);
+#ifdef COMPRESSED_TAGS
+	if (compressed) *p |= 0x80;
+#endif /* COMPRESSED_TAGS */
     }
     void fake_root_item() {
 	set_key_len(K1 + C2);   // null key length
@@ -626,6 +641,14 @@
 	 */
 	byte * split_p;
 
+#ifdef COMPRESSED_TAGS
+	/** DONT_COMPRESS or Z_DEFAULT_COMPRESSION, Z_HUFFMAN_ONLY, Z_RLE. */
+	int compress_strategy;
+
+	/** The dictionary (if any) to use for tag compression. */
+	string compress_dict;
+#endif /* COMPRESSED_TAGS */
+
 	/* Debugging methods */
 //	void report_block_full(int m, int n, const byte * p);
 };
