#include "MLA.h"
#include "ixDB.h"
#include "Sys.h"
#include "db.h"
#include <errno.h>
#include <sys/mman.h>
#include <unistd.h>

// Number of elements in the array a (a must be a true array, not a pointer).
#define	N(a)	(sizeof (a) / sizeof (a[0]))
// Round x up to the nearest multiple of y using integer arithmetic.
#define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))

// Return the address that lies off bytes past p.
inline const void* address(const void* p, size_t off)
{
    const char* base = static_cast<const char*>(p);
    return static_cast<const void*>(base + off);
}

// Shared empty string; hashstr() returns it for any empty input.
const char* MLA::nullstr = "";

/*
 * Construct a brand-new, empty MLA database in memory.  The
 * table-of-contents file is named by toc; the message body
 * and keyword index files and the message separator get their
 * compiled-in defaults.  Initial table capacities are derived
 * from the caller's estimates as follows:
 *
 * o the message table holds maxmsg entries
 * o the thread table holds maxmsg/3 entries (i.e. the
 *   expectation is 2 replies for every posting)
 * o the reply spillover table holds maxmsg/3 entries, rounded
 *   up to a multiple of 4 (most messages are expected to get
 *   a single reply that is stored in the message descriptor)
 * o the string pool gets avgmsgsize bytes for 3/5 of maxmsg,
 *   rounded up to a multiple of 4; since most messages are
 *   replies there should be significant sharing of strings
 *   (i.e. subject, mail addresses, user names)
 *
 * NOTE(review): the previous version of this comment quoted 1/6
 * for the spillover table and 1/2 for the string pool; the
 * figures above are what the code computes -- confirm which
 * was intended.
 *
 * maxmsize is recorded as the message body spillover point.
 * According to the original notes, when the tables overflow
 * the recalculated figures reflect the usage history from
 * messages already entered (that logic is not in this function).
 */
MLA::MLA(const char* toc, u_int maxmsg, u_int avgmsgsize, u_int maxmsize)
    : tocFile(toc)
    , msgFile(MLA_MSGNAME)
    , ixFile(MLA_IXNAME)
    , msgSeparator(MLA_MSGSEP)
{
    // nothing is open or attached yet
    fd = -1;
    trace = 0;				// tracing
    msgdb = NULL;			// message body database
    ixdb = NULL;			// optional inverted keyword index db

    // every table starts out empty ...
    nmsgs = 0;				// number messages in table
    nthreads = 0;			// number of threads in table
    nreplies = 0;			// number of entries in spillover table

    // ... with capacities derived from the caller's estimates
    maxmsgs = maxmsg;			// size of message table
    maxthreads = maxmsgs/3;		// size of thread table
    maxreplies = roundup(maxmsgs/3, 4);	// size of spillover table
    maxmsgsize = maxmsize;

    // string pool and its hash table
    strcount = 0;			// count of pooled strings
    strspace = 1;			// space used in pool
    maxstrspace = roundup((3*maxmsg/5)*avgmsgsize,4);
    memset(buckets, 0, sizeof (buckets));
    strhashtab = estrhashtab = strhashp = NULL;

    createMLA();
}
/*
 * Construct an MLA object for an existing database whose
 * table-of-contents file is toc.  Only the state needed for
 * safe destruction is initialized here; everything else is
 * expected to be read from the database (see loadMLA).
 */
MLA::MLA(const char* toc)
    : tocFile(toc)
{
    fd = -1;
    data = NULL;
    size = 0;
    memset(buckets, 0, sizeof (buckets));
    /*
     * The destructor unconditionally purges the string pool,
     * which deletes strhashtab; make sure these are never
     * garbage if the object is destroyed before it is loaded.
     */
    strhashtab = estrhashtab = strhashp = NULL;
    trace = 0;
    msgdb = NULL;			// message body database
    ixdb = NULL;			// optional inverted keyword index db
    // NB: everything else is expected to be read from the database
}
/*
 * Release everything the object owns: the keyword index, the
 * message body database, the table-of-contents image (either
 * a memory mapping or a new[]'d buffer) and the string pool's
 * hash table.
 */
MLA::~MLA()
{
    delete ixdb;
    if (msgdb != NULL)
	(*msgdb->close)(msgdb);
    if (fd != -1) {			// data was mmap'd
	// setupMLA can fail with the mmap failure value left in data
	if (data != (const char*) -1)
	    munmap((char*) data, size);
	close(fd);
    } else {				// data was read into new[]'d space
	// storage came from new char[], so it must go back via delete []
	delete [] (char*) data;
    }
    purgeStringPool();
}

/*
 * Write one diagnostic line to stderr: the table-of-contents
 * filename, a colon, the formatted message and a final period.
 */
void
MLA::vprintMsg(const char* fmt, va_list ap) const
{
    FILE* const out = stderr;
    fputs(getTOCFile() | ": ", out);
    vfprintf(out, fmt, ap);
    fputs(".\n", out);
}

/*
 * Print a printf-style message to stderr with "Warning, "
 * placed in front of the caller's text.
 */
void
MLA::warning(const char* fmt ...) const
{
    va_list args;
    fflush(stdout);			// push out pending stdout text first
    va_start(args, fmt);
    vprintMsg(fxStr("Warning, ") | fmt, args);
    va_end(args);
}

/*
 * Print a printf-style error message to stderr.
 */
void
MLA::error(const char* fmt ...) const
{
    va_list args;
    fflush(stdout);			// push out pending stdout text first
    va_start(args, fmt);
    vprintMsg(fmt, args);
    va_end(args);
}

/*
 * Print a printf-style message to stderr and then terminate
 * the process with exit status -1.  Does not return.
 */
void
MLA::fatal(const char* fmt ...) const
{
    va_list args;
    fflush(stdout);			// push out pending stdout text first
    va_start(args, fmt);
    vprintMsg(fmt, args);
    va_end(args);
    exit(-1);
}

/*
 * Create initial in-memory data structures.
 */
void
MLA::createMLA(void)
{
    size = sizeof (MLAHeader)
	+ maxmsgs * sizeof (MailMsg)	// message table
	+ maxthreads * sizeof (mnum_t)	// thread table
	+ maxreplies * sizeof (mnum_t)	// reply spillover table
	+ maxmsgs * sizeof (mnum_t)	// sorted message table
	+ maxstrspace			// string pool
	;
    char* cp = new char[size];
    memset(cp, 0, size);
    data = cp;
    MLAHeader& h = *(MLAHeader*) data;
    h.magic = MLA_MAGIC;
    h.version = MLA_MAJOR;
    h.maxmsgs = maxmsgs;		// max size of msg-oriented tables
    h.maxthreads = maxthreads;		// max number of threads
    h.maxreplies = maxreplies;		// max size of reply table
    h.maxstrspace = maxstrspace;	// max size of string pool
    h.maxmsgsize = maxmsgsize;
    table = (const MailMsg*) &h.msgs[0];// message table
    threads = (const mnum_t*) address(table, maxmsgs*sizeof (MailMsg));
    replies = (const mnum_t*) address(threads, maxthreads*sizeof (mnum_t));
    sorted = (const mnum_t*) address(replies, maxreplies*sizeof (mnum_t));
    strpool = (const char*) address(sorted, maxmsgs*sizeof (mnum_t));
}

// Set the tracing level.
void
MLA::setTrace(u_int t)
{
    trace = t;
}

/*
 * Open file with the given open flags and verify that it
 * starts with the MLA magic number.  On success the open
 * descriptor is returned through fd and the result is TRUE;
 * on failure a message is written to stderr, any descriptor
 * that was opened is closed and the result is FALSE.
 */
fxBool
MLA::openMLA(const char* file, int flags, int& fd)
{
    fd = Sys::open(file, flags, 0644);
    if (fd < 0) {
	perror(file);
	return (FALSE);
    }
    u_short magic;
    if (read(fd, &magic, sizeof (magic)) == sizeof (magic) &&
      magic == MLA_MAGIC)
	return (TRUE);
    // XXX can't use error message interface
    fprintf(stderr, "%s: not an MLA database, bad magic number.\n", file);
    close(fd);
    return (FALSE);
}

/*
 * Attach the database open on descriptor fd0 to this object.
 * The file is mapped into memory with the given mmap
 * protection (prot) and sharing (share) flags.  If it cannot
 * be mapped, the whole file is instead read into new[]'d
 * memory and the descriptor is closed (fd becomes -1, which
 * is how the destructor tells the two cases apart).
 *
 * Returns TRUE once the in-memory tables have been set up,
 * FALSE if the file could be neither mapped nor read.
 */
fxBool
MLA::setupMLA(int fd0, int prot, int share)
{
    fd = fd0;
    struct stat sb;
    (void) Sys::fstat(fd, sb);
    size = sb.st_size;
    data = (const char*) mmap(NULL, size, prot, share, fd, 0);
    if (data == (const char*) MAP_FAILED) {
	char* cp = new char[size];
	/*
	 * Whoever validated the file has already consumed part of
	 * it (openMLA reads the magic number), so rewind and read
	 * the image from the start instead of guessing how many
	 * bytes were consumed.
	 */
	if (::lseek(fd, 0, SEEK_SET) == (off_t) -1 ||
	  ::read(fd, cp, sb.st_size) != sb.st_size) {
	    error("setupMLA: read: %s", strerror(errno));
	    delete [] cp;
	    return (FALSE);
	}
	close(fd), fd = -1;
	data = cp;
    }
    loadMLA();
    return (TRUE);
}

// Return the pooled string at offset off, entering it in the hash table if needed.
const char* MLA::getstr(off_t off)
{
    const char* pooled = strpool + off;
    return hashstr(pooled, FALSE);
}

/*
 * Load initial data structures from the mmap'd/read data.
 */
void
MLA::loadMLA(void)
{
    const MLAHeader& h = *(const MLAHeader*) data;
    nmsgs = h.nmsgs;			// total number of messages
    maxmsgs = h.maxmsgs;		// max size of msg-oriented tables
    nthreads = h.nthreads;		// number of threads
    maxthreads = h.maxthreads;		// max number of threads
    nreplies = h.nreplies;		// size of reply table
    maxreplies = h.maxreplies;		// max size of reply table
    strcount = h.strcount;		// existing strings
    strspace = h.strspace;		// existing string pool space
    maxstrspace = h.maxstrspace;	// max space for string pool
    maxmsgsize = h.maxmsgsize;		// message body spillover point

    table = (const MailMsg*) &h.msgs[0];
    threads = (const mnum_t*) address(table, maxmsgs*sizeof (MailMsg));
    replies = (const mnum_t*) address(threads, maxthreads*sizeof (mnum_t));
    sorted = (const mnum_t*) address(replies, maxreplies*sizeof (mnum_t));
    strpool = (const char*) address(sorted, maxmsgs*sizeof (mnum_t));

    fxAssert(strpool+maxstrspace - (const char*) data <= size,
	"Invalid data structure sizes; file size too small for header sizes");

    strhashtab = new strhash[strcount];	// allocate in one chunk
    estrhashtab = &strhashtab[strcount];
    strhashp = strhashtab;

    msgSeparator = getstr(h.msgseparator);// inter-message pattern for parsing
    msgFile = getstr(h.msgfile);	// message database filename
    ixFile = getstr(h.ixfile);		// optional keyword index filename
}

/*
 * Reclaim all the resources used in
 * the string pool's hash table.
 */
void
MLA::purgeStringPool()
{
    for (int h = 0; h < N(buckets); h++) {
	strhash* next;
	for (strhash* hp = buckets[h]; hp; hp = next) {
	    next = hp->next;
	    if (hp < strhashtab || estrhashtab <= hp)
		delete hp;
	}
    }
    memset(buckets, 0, sizeof (buckets));
    delete strhashtab;
    strhashtab = estrhashtab = NULL;
}

/*
 * Return a reference to the specified string in
 * the string pool.  If the string is already known
 * to the hash table, either as a full string or as
 * the tail of a longer string, then return the
 * existing entry.  Otherwise, make a new hash entry
 * and return that.
 *
 * s      the string to look up; when isnew is FALSE it
 *        must point into the string pool itself (the
 *        bytes before it are examined)
 * isnew  TRUE to copy s into the pool when it is not
 *        found; FALSE when s already resides in the
 *        pool and only a hash entry is needed
 *
 * The empty string always yields nullstr.  Running out
 * of pool space or of preallocated hash entries trips
 * an fxAssert.
 */
const char*
MLA::hashstr(const char* s, fxBool isnew)
{
    if (*s == '\0')
	return (nullstr);
    u_int len = strlen(s);
    // bucket is selected by the low 7 bits of the last character,
    // so a string and every string it is a tail of share a bucket
    int h = s[len-1] & 0177;
    int d;
    strhash* hp;
    for (hp = buckets[h]; hp; hp = hp->next) {
	// d is where s would have to start inside this entry
	d = hp->len - len;
	if (d >= 0 && strcmp(hp->data+d, s) == 0)
	    return (hp->data+d);
    }
    d = 0;
    if (isnew) {
	fxAssert(strspace+len+1 < maxstrspace, "String pool overflow");
	strcount++;
	hp = new strhash;
	// append the string (with its terminating null) to the pool
	hp->off = strspace;
	hp->data = strpool + hp->off;
	strspace += len+1;
	memcpy((char*) hp->data, s, len+1);
    } else {
	fxAssert(strhashp < estrhashtab, "String hash table overflow");
	/*
	 * String is already in the string pool.  Locate the
	 * start of the string so that we share hash buckets
	 * as intended.  Finding the start of the string is
	 * simple because we know the pool is packed with each
	 * string null-terminated and the first string in the
	 * pool is the null string.  Thus to find the start of
	 * the string we just back up until we see a null.
	 */
	if (s[-1] != '\0') {
	    do {
		s--, d++;
	    } while (s[-1] != '\0');
	    len += d;				// recalculate string length ...
	    h = s[len-1] & 0177;		// ... and bucket hash
	}
	// take the next unused entry from the preallocated table
	(hp = strhashp++)->data = s;
	hp->off = s - strpool;
    }
    // link the entry at the head of its bucket's chain
    hp->len = len;
    hp->next = buckets[h];
    buckets[h] = hp;
    // d skips back to the string that was actually requested
    return (hp->data+d);
}

/*
 * Return the offset in the string pool for the requested
 * string.  Offsets start at 1 so that offset zero can be
 * used for the null string.
 *
 * s is expected to be a pointer previously returned by
 * hashstr; it is located by address, not by content.  If no
 * hash entry covers s an error is reported and 0 is returned.
 */
off_t
MLA::stroff(const char* s) const
{
    if (*s != '\0') {
	u_int len = strlen(s);
	for (strhash* hp = buckets[s[len-1] & 0177]; hp; hp = hp->next) {
	    /*
	     * Test that s lies within this entry's bytes before
	     * forming the difference; narrowing the difference of
	     * two unrelated pointers to an unsigned can otherwise
	     * alias into the valid range on 64-bit systems.
	     */
	    if (hp->data <= s && s < hp->data + hp->len)
		return (hp->off + (s - hp->data));
	}
	error("Offset requested for unhashed string \"%s\"", s);
    }
    return (0);			// nullstr
}

/*
 * Compact the string pool into the provided space.
 */
void
MLA::compact(char* cp, u_int& sc, u_int& ss)
{
    off_t off = 1;
    u_int count = 1;
    *cp = '\0';				// set null string at offset 0
    for (u_int i = 0; i < N(buckets); i++)
	for (strhash* hp = buckets[i]; hp; hp = hp->next) {
	    /*
	     * We search backward for a better string since we
	     * know that any previous string would have been
	     * selected when the string was originally entered.
	     * This is also important because we know any better
	     * string will already have an assigned offset in
	     * the string pool that we can copy.
	     */
	    strhash* best = hp;
	    for (strhash* tp = buckets[i]; tp != hp; tp = tp->next) {
		int d = tp->len - hp->len;
		if (d > 0 && strcmp(&tp->data[d], hp->data) == 0)
		    if (tp->len > best->len)
			best = tp;
	    }
	    if (best == hp) {
		hp->off = off;
		memcpy(cp+off, hp->data, hp->len+1);
		off += hp->len+1;
		count++;
	    } else
		hp->off = best->off + (best->len - hp->len);
	}
    ss = off;		// revised size
    sc = count;		// revised string count
}

/*
 * Strip leading blanks/tabs and trailing blanks/tabs/newlines
 * from a string.  If what remains is entirely enclosed in a
 * pair of double quotes or parentheses the pair is removed
 * and the result is trimmed again, until no enclosing pair
 * is left.
 */
void
MLA::trimWS(fxStr &s)
{
    for (;;) {
	const u_int lead = s.skip(0, " \t");
	if (lead != 0)
	    s.remove(0, lead);
	const u_int keep = s.skipR(s.length(), " \t\n");
	if (keep != s.length())
	    s.resize(keep);
	// only strip quotes/parens that surround the entire string
	const u_int len = s.length();
	if (len <= 1)
	    return;
	const char first = s[0];
	const char last = s[len-1];
	const int quoted = (first == '"' && last == '"');
	const int parenthesized = (first == '(' && last == ')');
	if (!quoted && !parenthesized)
	    return;
	s = s.extract(1, len-2);
    }
}

/*
 * Return a as a whole percentage of b (truncated toward
 * zero), or 0 when b is 0.  The product is formed in 64 bits
 * so that large counts (e.g. string pool byte counts) do not
 * wrap around before the division.
 */
inline u_int pct(u_int a, u_int b)
{
    if (b == 0)
	return (0);
    return (u_int) (100ULL*a/b);
}

void
MLA::printHeader(FILE* fp) const
{
    const MLAHeader& h = *(const MLAHeader*)data;
    fprintf(fp, "magic:   %x    version: %d\n", h.magic, h.version);
    fprintf(fp, "msgs:    %u of %u (%u%%)\n",
	nmsgs, maxmsgs, pct(nmsgs, maxmsgs));
    fprintf(fp, "threads: %u of %u (%u%%)\n",
	nthreads, maxthreads, pct(nthreads, maxthreads));
    fprintf(fp, "replies: %u of %u (%u%%)\n",
	nreplies, maxreplies, pct(nreplies, maxreplies));
    fprintf(fp, "strings: %.1fK of %.1fK (%u%%) (%u strings)\n",
	strspace/1024., maxstrspace/1024.,
	pct(strspace, maxstrspace), strcount);
    fprintf(fp, "msgsep:  \"%s\"\n", (const char*) msgSeparator);
    fprintf(fp, "maxsize: %u (msg spillover to file)\n", maxmsgsize);
    fprintf(fp, "msgfile: \"%s\"\n", (const char*) msgFile);
    fprintf(fp, "ixfile:  \"%s\"\n", (const char*) ixFile);
}

void
MLA::printReplyNums(FILE* fd, const MailMsg& msg) const
{
    fprintf(fd, "replies = [");
    const char* sep = "";
    if (msg.nreplies > 1) {
	const mnum_t* ix = &replies[msg.replynum];
	u_int n = msg.nreplies;
	for (u_int i = 0; n > 4; i += 3, n -= 3) {
	    fprintf(fd, "%s%d", sep, table[ix[0]].msgnum);
	    fprintf(fd, ",%d", table[ix[1]].msgnum);
	    fprintf(fd, ",%d", table[ix[2]].msgnum);
	    sep = ",";
	    ix = &replies[ix[3]];
	}
	switch (n) {
	case 4: fprintf(fd, "%s%d", sep, table[*ix++].msgnum), sep = ",";
	case 3: fprintf(fd, "%s%d", sep, table[*ix++].msgnum), sep = ",";
	case 2: fprintf(fd, "%s%d", sep, table[*ix++].msgnum), sep = ",";
	case 1: fprintf(fd, "%s%d", sep, table[*ix].msgnum), sep = ",";
	}
    } else
	fprintf(fd, "%d", msg.replynum);
    fprintf(fd, "]");
}

/*
 * Open the message body database named by msgFile as a hash
 * database using the given open flags.  Returns TRUE on
 * success; otherwise reports an error and returns FALSE.
 */
fxBool
MLA::openMsgDB(u_int flags)
{
    msgdb = dbopen(msgFile, flags, 0644, DB_HASH, NULL);
    if (msgdb != NULL)
	return (TRUE);
    error("Could not open article database %s", (const char*) msgFile);
    return (FALSE);
}

/*
 * Open the keyword index database named by ixFile as a hash
 * database and wrap it in an ixDB that inherits the current
 * tracing level.  A read-only open of a file that is not a
 * regular file quietly returns FALSE; any other failure to
 * open is reported as an error.  Returns TRUE on success.
 */
fxBool
MLA::openIXDB(u_int flags)
{
    if (flags == O_RDONLY) {
	// the index is optional; don't complain when it isn't there
	if (!Sys::isRegularFile(ixFile))
	    return (FALSE);
    }
    DB* handle = dbopen(ixFile, flags, 0644, DB_HASH, NULL);
    if (handle == NULL) {
	error("Could not open index database %s", (const char*) ixFile);
	return (FALSE);
    }
    ixdb = new ixDB(handle);
    ixdb->setTracing(trace);
    return (TRUE);
}

void
MLA::decode(FILE* fout, z_stream& zstream, const MailMsg& msg) const
{
    char obuf[16*1024];
    zstream.next_out = (Bytef*) obuf;
    zstream.avail_out = sizeof (obuf);
    do {
	int state = inflate(&zstream, Z_PARTIAL_FLUSH);
	if (state == Z_STREAM_END)
	    break;
	if (state != Z_OK) {
	    error("Message %u: Decoding error; %s", msg.msgnum, zstream.msg);
	    break;
	}
	size_t occ = sizeof (obuf) - zstream.avail_out;
	if (fwrite(obuf, occ, 1, fout) != 1) {
	    error("Message %u: Error writing output file", msg.msgnum);
	    return;
	}
	zstream.next_out = (Bytef*) obuf;
	zstream.avail_out = sizeof (obuf);
    } while (zstream.avail_in > 0);
    if (zstream.avail_out != sizeof (obuf)) {
	size_t occ = sizeof (obuf) - zstream.avail_out;
	if (fwrite(obuf, occ, 1, fout) != 1)
	    error("Message %u: Error writing output file", msg.msgnum);
    }
}

void
MLA::printMsgBody(FILE* fp, const MailMsg& msg) const
{
    // NB: this is why const is usually a bad idea...
    if (msgdb || ((MLA*) this)->openMsgDB(O_RDONLY)) {
	DBT key;
	key.data = (void*) &msg.msgnum;
	key.size = sizeof (msg.msgnum);
	DBT data;
	switch ((*msgdb->get)(msgdb, &key, &data, 0)) {
	case -1:
	    error("Unable to \"get\" msg %u, errno %u", msg.msgnum, errno);
	    return;
	case 1:
	    error("Msg %u not found in database", msg.msgnum);
	    return;
	}
	const char* cp = (const char*) data.data;
	int type = *cp++;				// type of data
	z_stream zstream;
	zstream.zalloc = NULL;
	zstream.zfree = NULL;
	zstream.opaque = NULL;
	zstream.data_type = Z_BINARY;
	if (inflateInit(&zstream) == Z_OK) {
	    if (type >= 10) {
		type -= 10;				// reduce to base type
		int fd = Sys::open(cp, O_RDONLY);
		if (fd >= 0) {
		    struct stat sb;
		    (void) Sys::fstat(fd, sb);
		    zstream.avail_in = sb.st_size;
		    cp = (const char*)
			mmap(NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0);
		    if (cp == (const char*) -1) {
			u_char* xp = new u_char[sb.st_size];
			if (read(fd, xp, sb.st_size) == sb.st_size) {
			    zstream.next_in = (Bytef*) xp;
			    decode(fp, zstream, msg);
			} else
			    error("Unable to read spillover msg body from"
				" file \"%s\", msg %u, errno %d",
				cp, msg.msgnum, errno);
			delete xp;
		    } else {
			zstream.next_in = (Bytef*) cp;
			decode(fp, zstream, msg);
			munmap((char*) cp, sb.st_size);
		    }
		    close(fd);
		} else
		    error("Unable to open spillover file \"%s\""
			" for msg %u, errno %d", cp, msg.msgnum, errno);
	    } else {
		zstream.next_in = (Bytef*) cp;
		zstream.avail_in = data.size-1;
		decode(fp, zstream, msg);
	    }
	    inflateEnd(&zstream);
	}
    }
}
