#include "ReadWriteMLA.h"
#include <sys/file.h>
#include <sys/mman.h>
#include "Sys.h"
#include <ctype.h>
#include <errno.h>
#include "db.h"
#include "ixDB.h"
#include "SpellDict.h"

/*
 * Round x up to the nearest multiple of y (x is returned
 * unchanged when it is already a multiple).  y appears three
 * times in the expansion, so it must be free of side effects.
 */
#define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))

/*
 * Construct with explicit sizing arguments, which are passed
 * to the MLA base class unchanged.  The lookup hash table starts
 * empty, no messages are considered ``old'', and string pool
 * compaction is enabled.
 */
ReadWriteMLA::ReadWriteMLA(const char* toc, u_int maxm, u_int avgm, u_int maxms)
   : MLA(toc, maxm, avgm, maxms)
{
    dict = NULL;			// no spelling dictionary yet
    compactPool = TRUE;
    overwrite = FALSE;
    sort = FALSE;			// set once out-of-order msgs are seen
    oldmsgs = 0;
    memset(hashtab, 0, sizeof (hashtab));
}
/*
 * Construct from a table-of-contents name only (this is the
 * form readMLA uses).  Identical to the sized constructor except
 * that string pool compaction starts out disabled.
 */
ReadWriteMLA::ReadWriteMLA(const char* toc) : MLA(toc)
{
    dict = NULL;			// no spelling dictionary yet
    compactPool = FALSE;
    overwrite = FALSE;
    sort = FALSE;			// set once out-of-order msgs are seen
    oldmsgs = 0;
    memset(hashtab, 0, sizeof (hashtab));
}
/*
 * Release what this class allocated itself: the malloc'd
 * hash buckets created by addHashEntry and the spelling
 * dictionary opened by setDictionary.
 */
ReadWriteMLA::~ReadWriteMLA()
{
    /*
     * Buckets are only released elsewhere by purgeHashTable,
     * which leaves every slot NULL, so freeing the non-NULL
     * slots here cannot double-free.
     */
    for (u_int i = 0; i < HASHSIZE; i++)
	if (hashtab[i])
	    free(hashtab[i]);
    delete dict;
}

/*
 * Simple configuration setters.  Each one stores its argument
 * in the corresponding member and does nothing else.
 */
void
ReadWriteMLA::setOverwrite(fxBool b)
{
    overwrite = b;
}

void
ReadWriteMLA::setTOCFile(const char* s)
{
    tocFile = s;
}

void
ReadWriteMLA::setMsgFile(const char* s)
{
    msgFile = s;
}

void
ReadWriteMLA::setIXFile(const char* s)
{
    ixFile = s;
}

void
ReadWriteMLA::setMsgSeparator(const char*s)
{
    msgSeparator = s;
}

void
ReadWriteMLA::setCompactPool(fxBool b)
{
    compactPool = b;
}

void
ReadWriteMLA::setDefaultAddr(const char* s)
{
    defAddr = s;
}

void
ReadWriteMLA::setInputFile(const char* s)
{
    inputFile = s;
}

/*
 * Hand a stop-list specification to the keyword index,
 * opening (and if need be creating) the index first.  The
 * request is silently dropped when the index cannot be opened.
 */
void
ReadWriteMLA::addStopList(const char* w)
{
    if (!ixdb && !openIXDB(O_RDWR|O_CREAT))
	return;
    ixdb->addStopList(w);
}

void
ReadWriteMLA::setDictionary(const char* file)
{
    if (ixdb || openIXDB(O_RDWR|O_CREAT)) {
	if (dict = SpellDict::open(file))
	    ixdb->setDictionary(dict);
    }
}

/*
 * Forward the minimum word length to the keyword index,
 * opening (and if need be creating) the index first.  The
 * request is silently dropped when the index cannot be opened.
 */
void
ReadWriteMLA::setMinWordLength(u_int l)
{
    if (!ixdb && !openIXDB(O_RDWR|O_CREAT))
	return;
    ixdb->setMinWordLength(l);
}

/*
 * Forward the maximum word length to the keyword index,
 * opening (and if need be creating) the index first.  The
 * request is silently dropped when the index cannot be opened.
 */
void
ReadWriteMLA::setMaxWordLength(u_int l)
{
    if (!ixdb && !openIXDB(O_RDWR|O_CREAT))
	return;
    ixdb->setMaxWordLength(l);
}

/*
 * Write a diagnostic to stderr in the form
 *
 *	<inputFile> (MLA <TOC file>): <formatted message>.
 *
 * The trailing period and newline are supplied here, so fmt
 * should carry neither.  fmt/ap are passed straight to vfprintf.
 */
void
ReadWriteMLA::vprintMsg(const char* fmt, va_list ap) const
{
    // NOTE(review): ``|'' is presumably string concatenation on
    // the project's string class -- confirm against its header.
    fputs(inputFile | " (MLA " | getTOCFile() | "): ", stderr);
    vfprintf(stderr, fmt, ap);
    fputs(".\n", stderr);
}

/*
 * Open an existing archive for update.  The file is opened
 * read-write, an exclusive flock is requested on it (the
 * result of flock is not checked), and it is then set up with
 * a shared read/write mapping.  On success every message
 * already present is recorded as ``old''.
 *
 * Returns the new object, or NULL if the file cannot be opened
 * or set up.
 */
ReadWriteMLA*
ReadWriteMLA::readMLA(const char* file)
{
    int fd;
    if (!openMLA(file, O_RDWR, fd))
	return (NULL);
    flock(fd, LOCK_EX);
    ReadWriteMLA* archive = new ReadWriteMLA(file);
    if (!archive->setupMLA(fd, PROT_READ|PROT_WRITE, MAP_SHARED)) {
	delete archive;
	return (NULL);
    }
    archive->oldmsgs = archive->nmsgs;
    return (archive);
}

/*
 * Hash a string in the string pool that
 * is to be compared only for equality.
 *
 * The hash is taken from the pointer value, not from the
 * characters, so it is only meaningful for strings that share
 * storage.  The pointer is widened to unsigned long before the
 * modulus because a direct pointer-to-unsigned cast is rejected
 * by compilers on platforms whose pointers are wider than
 * unsigned; on 32-bit platforms the result is unchanged.
 */
inline unsigned poolhash(const char* s)
    { return (unsigned) (((unsigned long) s) % HASHSIZE); }

/*
 * Return a pointer past any leading reply prefix: when isRe
 * recognizes s, skip three characters and then any white space
 * that follows.  Otherwise s is returned unchanged.  The result
 * always points into the caller's string.
 */
const char*
ReadWriteMLA::stripre(const char* s)
{
    if (isRe(s)) {
	/*
	 * isspace is only defined for values representable as
	 * unsigned char (or EOF), so avoid passing a negative
	 * plain char from 8-bit subject text.
	 */
	for (s += 3; isspace((unsigned char) *s); s++)
	    ;
    }
    return (s);
}

/*
 * Hash a subject string w/o any leading ``Re:''.
 *
 * Multiply-by-31 hash over every character of s, reduced
 * modulo HASHSIZE.  The caller is expected to have removed
 * any reply prefix; nothing is stripped here.
 */
static unsigned
subjhash(const char* s)
{
    unsigned sum = 0;
    for (const char* p = s; *p != '\0'; p++)
	sum = *p + 31*sum;
    return (sum % HASHSIZE);
}

/*
 * Hash a date string.
 *
 * The last ':' in the string is located (scanning from the
 * end, index 0 is never tested) and the characters from two
 * positions past it back down to the start of the string are
 * folded into a multiply-by-31 hash, reduced modulo HASHSIZE.
 * Anything further to the right does not affect the result.
 *
 * The starting position is clamped to the terminating NUL so
 * that empty or very short strings, or strings that end in
 * ':', are never read out of bounds.  A NUL at the start
 * position contributes nothing, so results for all strings
 * that were previously hashed in bounds are unchanged.
 */
static unsigned
datehash(const char* s)
{
    size_t len = strlen(s);
    size_t pos = (len > 0) ? len-1 : 0;
    while (pos > 0 && s[pos] != ':')
	pos--;
    size_t i = pos + 2;
    if (i > len)			// never start beyond the NUL
	i = len;
    unsigned h = 0;
    for (;;) {				// walk back to s[0] inclusive
	h = s[i] + 31*h;
	if (i == 0)
	    break;
	i--;
    }
    return (h % HASHSIZE);
}

/*
 * Compare the leading parts of two date strings.
 *
 * The strings must agree character for character up to and
 * including the first ':', and in the (up to) four characters
 * that follow it; whatever comes later is ignored.  Returns
 * TRUE on such a match.  FALSE is returned on the first
 * mismatch, and also when the strings end before any ':' is
 * seen (a string without a ':' never matches).
 *
 * Both scans stop at the terminating NUL so that short or
 * colon-less strings are never read out of bounds.
 */
static fxBool
datecmp(const char* s1, const char* s2)
{
    for (; *s1 == *s2 && *s1 != '\0'; s1++, s2++) {
	if (*s1 != ':')
	    continue;
	for (int i = 1; i <= 4; i++) {
	    if (s1[i] != s2[i])
		return (FALSE);
	    if (s1[i] == '\0')		// both ended together
		break;
	}
	return (TRUE);
    }
    return (FALSE);
}

/*
 * Search old messages from back to front (newest to oldest)
 * looking for a message that matches an in-reply-to spec.
 * ``Old'' messages are table[0] .. table[oldmsgs-1].
 *
 * For each message the tests are, in order: exact message id,
 * date (see datecmp), and the first 10 characters of the
 * subject against inreply with any reply prefix stripped.
 * subjMatch is set to TRUE only when the subject test is the
 * one that matched; it is otherwise left untouched.
 *
 * Returns the matching message or NULL.
 *
 * The scan is index based so that the oldest message,
 * table[0], is examined as well and no pointer is ever stepped
 * in front of the table.
 *
 * XXX need to bound search to avoid walking the entire db
 */
MailMsg*
ReadWriteMLA::findOldReply(const char* inreply, u_int& subjMatch)
{
    const char* cp = stripre(inreply);
    for (u_int i = oldmsgs; i-- > 0; ) {
	const MailMsg* msg = &table[i];
	if (!strcmp(inreply, offstr(msg->msgid)))
	    return ((MailMsg*) msg);
	if (datecmp(inreply, offstr(msg->date)))
	    return ((MailMsg*) msg);
	if (strncmp(cp, offstr(msg->subject), 10) == 0) {
	    subjMatch = TRUE;
	    return ((MailMsg*) msg);
	}
    }
    return (NULL);
}

/*
 * Locate a message referenced in an "in-reply-to:" field.
 *
 * Three hash lookups are tried in turn, all against the same
 * table: by message id (pointer identity), by date string, and
 * by subject (pointer identity or string equality).  If none
 * succeeds and there are old messages, the old messages are
 * searched linearly.  subjMatch is cleared on entry and set
 * only when the message was located by its subject.
 *
 * Each bucket holds its entry count in slot 0 followed by that
 * many message indices; a bucket that exists is assumed to
 * hold at least one entry.
 */
MailMsg*
ReadWriteMLA::findReply(const char* inreply, u_int& subjMatch)
{
    subjMatch = FALSE;

    mnum_t* bucket = hashtab[poolhash(inreply)];	// by message id
    if (bucket != NULL) {
	u_int left = (u_int) *bucket++;
	do {
	    const MailMsg& cand = table[*bucket++];
	    if (cand.msgid == inreply)
		return ((MailMsg*) &cand);
	} while (--left);
    }

    bucket = hashtab[datehash(inreply)];		// by date string
    if (bucket != NULL) {
	u_int left = (u_int) *bucket++;
	do {
	    const MailMsg& cand = table[*bucket++];
	    if (datecmp(inreply, cand.date))
		return ((MailMsg*) &cand);
	} while (--left);
    }

    bucket = hashtab[subjhash(inreply)];		// by subject
    if (bucket != NULL) {
	u_int left = (u_int) *bucket++;
	do {
	    const MailMsg& cand = table[*bucket++];
	    if (cand.subject == inreply || strcmp(inreply, cand.subject) == 0) {
		subjMatch = TRUE;
		return ((MailMsg*) &cand);
	    }
	} while (--left);
    }

    if (!oldmsgs)
	return (NULL);
    return (findOldReply(inreply, subjMatch));
}

/*
 * Add an entry to the hash table.  Each hash table entry
 * is an array of message indices.  Each array ``a'' always
 * the number of used entries in a[0] and the array size
 * always having space for a multiple of four indices.
 */
void
ReadWriteMLA::addHashEntry(u_int h, const MailMsg& msg)
{
    mnum_t* ix;
    u_int n = 1;
    if (ix = hashtab[h]) {
	n += ix[0];
	if ((n&3) == 0) {
	    ix = (mnum_t*) realloc(ix, 1+(n+4) * sizeof (mnum_t));
	    fxAssert(ix != NULL, "No memory to expand hash table (realloc)");
	    hashtab[h] = ix;
	}
    } else {
	ix = (mnum_t*) malloc(4*sizeof (mnum_t));
	fxAssert(ix != NULL, "No memory to expand hash table (malloc)");
	hashtab[h] = ix;
    }
    ix[0] = n;
    ix[n] = msg.msgnum;
}

/*
 * Discard the lookup hash table: free every bucket, clear all
 * slots, and from now on treat every message currently in the
 * table as ``old'' (i.e. reachable only through findOldReply).
 */
void
ReadWriteMLA::purgeHashTable(void)
{
    u_int slot = 0;
    while (slot < HASHSIZE) {
	if (hashtab[slot] != NULL)
	    free(hashtab[slot]);
	slot++;
    }
    memset(hashtab, 0, sizeof (hashtab));
    oldmsgs = nmsgs;			// reset w/ hash table purged
}

/*
 * Link msg underneath parent: record the parent (stored as
 * the parent's message number plus one), inherit the parent's
 * thread, and append msg to the parent's reply list.  At trace
 * levels above 1 the new linkage is reported on stdout.
 * inreply is used only in that trace output.
 */
void
ReadWriteMLA::setParent(MailMsg& msg, MailMsg& parent, const char* inreply)
{
    msg.thread = parent.thread;
    msg.parent = parent.msgnum+1;
    addReply(parent, msg);
    if (trace <= 1)
	return;
    printf("add reply %d for msg %d, in-reply-to \"%s\", ",
	msg.msgnum, parent.msgnum, inreply);
    printReplyNums(stdout, parent);
    putc('\n', stdout);
}

/*
 * Report whether two messages appear to share a subject: the
 * first 10 characters of each subject are compared after any
 * reply prefix is stripped.  m2's subject is passed through
 * checkstr first; m1's is used as is.
 */
fxBool
ReadWriteMLA::subjcmp(const MailMsg& m1, const MailMsg& m2)
{
    const char* lhs = stripre(m1.subject);
    const char* rhs = stripre(checkstr(m2.subject));
    return (strncmp(lhs, rhs, 10) == 0);
}

/*
 * Enter a message into the hashed database.  The message
 * is setup so that it can be retrieved by date, message id,
 * or subject.  Cross-indexing of replies is also done.
 *
 *   msg      descriptor of the message being added; its thread
 *            (and, for a reply, its parent) fields are filled in
 *            here and msg.maybereply is set by the lookups
 *   inreply  in-reply-to string, or nullstr (compared by
 *            pointer) when the message has none
 *   line     input line number, used only in warnings
 *
 * The sorted and thread tables must have room for the new
 * entry; this is checked with fxAssert.
 */
void
ReadWriteMLA::addMessage(MailMsg& msg, const char* inreply, u_int line)
{
    if (trace > 2)
	msg.print(stdout, *this);

    MailMsg* parent;
    if (inreply != nullstr) {
	/*
	 * Search for the message this is a reply to.  Note
	 * that we do this before entering the message below so
	 * that lookups never find this message.
	 */
	parent = findReply(inreply, msg.maybereply);
	if (parent) {
	    /*
	     * If a parent message was located, check the subject
	     * lines.  If someone replies to a previous message in order
	     * to get the return address, but the followup message is
	     * totally unrelated to the original then we get false
	     * relationships.  However if we reject linkage because
	     * of subject mismatches then folks that edit the subject
	     * line (e.g. to add additional information) will not be
	     * included in a thread.  Experience indicates the first
	     * problem occurs much less than the second (and those
	     * people get what they deserve!)
	     *
	     * Hence a mismatch only produces a warning; the
	     * message is still linked to the parent below.
	     */
	    if (!msg.maybereply && !subjcmp(msg, *parent))
		warning("possibly unrelated reply for msg %u, line %u",
		    msg.msgnum, line);
	} else if (isRe(msg.subject)) {
	    /*
	     * A parent message was not found, typically because the
	     * original message was cc'd to the archive.  In this case
	     * the in-reply-to field for the message has the message id
	     * of the message sent to the person.  Our only recourse is
	     * to search for the subject line.
	     */
	    const char* subj = stripre(msg.subject);
	    parent = findReply(subj, msg.maybereply);	// 1st for original msg
	    if (!parent)				// 2nd for a reply
		parent = findReply(msg.subject, msg.maybereply);
	    // report the stripped subject as the matched reference
	    if (parent)
		inreply = subj;
	}
	if (!parent)
	    warning("no reply found for msg %u, line %u, in-reply-to \"%s\"",
		msg.msgnum, line, inreply);
    } else
	parent = NULL;
    /*
     * Add new message to hash table.  All three keys go into
     * the one table; findReply probes it with the matching hash.
     */
    addHashEntry(poolhash(msg.msgid), msg);	// by message id
    addHashEntry(subjhash(msg.subject), msg);	// by subject
    addHashEntry(datehash(msg.date), msg);	// by date w/o timezone
    /*
     * If messages are inserted out of order, then the table
     * will have to be sorted to insure the query logic works.
     * The sort itself is deferred to update(); only the first
     * out-of-order message is reported.
     */
    fxAssert(nmsgs < maxmsgs, "Sorted message table overflow");
    ((mnum_t*) sorted)[nmsgs] = msg.msgnum;
    if (nmsgs > 0 && msg.datetime < table[nmsgs-1].datetime && !sort) {
	warning("out of order messages, first one is msg %u at line %u",
	    msg.msgnum, line);
	sort = TRUE;
    }
    if (!parent) {
	/*
	 * Message starts a new thread, assign it a thread number
	 * and enter it into the thread table.
	 */
	fxAssert(nthreads < maxthreads, "Thread table overflow");
	msg.thread = nthreads++;
	((mnum_t*) threads)[msg.thread] = msg.msgnum;
    } else
	setParent(msg, *parent, inreply);
}

/*
 * Helpers for grow(); both work on its local byte offsets
 * ``oldoff'' and ``newoff''.
 *
 *   COPY(d, space)	move space bytes from d+oldoff to d+newoff
 *			(memmove, so the regions may overlap)
 *   ADJUST(o,n,t)	step oldoff back by o and newoff back by n
 *			elements of type t, i.e. from the start of
 *			one table to the start of the table that
 *			precedes it in the old and new layouts
 */
#define	COPY(d, space)	memmove(d+newoff, d+oldoff, (space))
#define	ADJUST(o,n,t)	(newoff -= (n)*sizeof (t), oldoff -= (o)*sizeof (t))
/*
 * Grow the database.
 *
 * The on-disk state is brought up to date, all in-memory
 * lookup state is dropped, new table capacities are chosen,
 * the file is extended and remapped, and the tables that
 * follow the message table are moved to their new positions.
 * Finally the new capacities are written to the header and
 * the archive is reloaded.  Only mmap'd databases are handled.
 */
void
ReadWriteMLA::grow(void)
{
    warning("growing database, "
	"msgs [%u:%u] threads [%u:%u] replies [%u:%u] strings [%u:%u]",
	nmsgs, maxmsgs, nthreads, maxthreads, nreplies, maxreplies,
	strspace, maxstrspace);
    /*
     * Force everything to disk and purge all in-memory
     * references that will be invalid when the expanded
     * file is grown below.
     */
    update();				// NB: also converts msg descriptors
    purgeStringPool();			// clear string hash table
    purgeHashTable();			// clear references to flushed msgs
    /*
     * Calculate the current usage patterns for the fixed size
     * tables and try to balance things so that if the usage
     * patterns continue the tables will all be filled at about
     * the same time.  Note that the recalculated reply spillover
     * table and string pool are forced to 32-bit boundaries;
     * this is consistent with calculations done elsewhere and
     * with assumptions in other parts of the library.
     */
    u_int newmaxmsgs;
    if (nmsgs < maxmsgs) {		// insure at least 25% free
	if ((100*nmsgs/maxmsgs) > 75)
	    newmaxmsgs = maxmsgs + (nmsgs/4);
	else
	    newmaxmsgs = maxmsgs;
    } else				// overflow, expand by one third
	newmaxmsgs = 4*maxmsgs/3;
    /*
     * Calculate the new values for the max threads,
     * replies, and string space.  We do this using
     * double-precision floating point to avoid 32-bit
     * integer overflow problems (we don't need exact
     * answers).
     *
     * NOTE(review): each value is scaled by 1/nmsgs, so this
     * relies on nmsgs being non-zero when grow() is called --
     * confirm against the callers.
     */
#define	fmuldiv(a,b,c)	(u_int) ((double(a) * double(b)) / double(c))
    u_int newmaxthreads = fmuldiv(newmaxmsgs, nthreads, nmsgs);
    u_int newmaxreplies = roundup(fmuldiv(newmaxmsgs, nreplies, nmsgs), 4);
    u_int newmaxstrspace = roundup(fmuldiv(newmaxmsgs, strspace, nmsgs), 4);

    MLAHeader* h = (MLAHeader*) data;	// XXX for calculating header size
    /*
     * Grow the file and copy the old data.
     */
    if (fd == -1) {			// no mmap'd file
	fxAssert(FALSE, "Can't grow un-mmap'd database yet!");
    } else {				// db is mmap'd
	/*
	 * Calculate file growth and touch the last byte in the
	 * file to force it to grow; then remap the file and
	 * copy the old data to the new locations.  Note that
	 * this technique basically eliminates any holes that
	 * might be in a file (except at the very end).
	 *
	 * NOTE(review): the differences below are unsigned.  If
	 * rebalancing makes one of the new capacities smaller
	 * than the old one the subtraction wraps; whether the
	 * sum still comes out right depends on the relative
	 * widths of u_int, size_t and off_t -- verify on the
	 * target platform.
	 */
	off_t newspace =
	      (newmaxmsgs - maxmsgs) * sizeof (MailMsg)
	    + (newmaxthreads - maxthreads) * sizeof (mnum_t)
	    + (newmaxreplies - maxreplies) * sizeof (mnum_t)
	    + (newmaxmsgs - maxmsgs) * sizeof (mnum_t)
	    + (newmaxstrspace - maxstrspace)
	    ;
	/*
	 * Flush old mapping.  From here on ``data'', ``h'' and
	 * ``strpool'' still hold addresses in the old mapping;
	 * they are used below only to compute offsets.
	 */
	(void) munmap((char*) data, size);
	/*
	 * NB: if an error occurs in the following steps the
	 *     database will still be in a consistent state
	 *     (that's why it's ``ok'' to just call fatal).
	 */
#ifdef notdef
	if (lseek(fd, size+newspace, SEEK_SET) == -1)
	    fatal("Could not extend file (lseek: %s)", strerror(errno));
	char zero = 0;
	if (write(fd, &zero, 1) != 1)
	    fatal("Could not extend file (write: %s)", strerror(errno));
#else
	if (ftruncate(fd, size+newspace) == -1)
	    fatal("Could not extend file (ftruncate: %s)", strerror(errno));
#endif
	/*
	 * Create a new mapping for the extended file.
	 */
	char* newdata = (char*)
	    mmap(NULL, size+newspace, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (newdata == (char*) -1)
	    fatal("Unable to mmap grown file (mmap: %s)", strerror(errno));
	/*
	 * Copy old data back to front to new locations.
	 * Note that we (cleverly) put the largest data
	 * structure at the front so it can grow without
	 * being copied.
	 *
	 * Layout after the header, front to back: message
	 * table, thread table, reply spillover table, sorted
	 * message table, string pool.  Only the in-use part of
	 * each table is moved.
	 */
	off_t newoff =			// offset to string pool in new map
	      (&h->msgs[0] - data)
	    + newmaxmsgs * sizeof (MailMsg)
	    + newmaxthreads * sizeof (mnum_t)
	    + newmaxreplies * sizeof (mnum_t)
	    + newmaxmsgs * sizeof (mnum_t)
	    ;
	// offset to string pool in old layout
	off_t oldoff = (off_t)((const char*) strpool - data);
	COPY(newdata, strspace);			// string pool
	ADJUST(maxmsgs, newmaxmsgs, mnum_t);
	COPY(newdata, nmsgs * sizeof (mnum_t));		// sorted message table
	ADJUST(maxreplies, newmaxreplies, mnum_t);
	COPY(newdata, nreplies * sizeof (mnum_t));	// reply spillover table
	ADJUST(maxthreads, newmaxthreads, mnum_t);
	COPY(newdata, nthreads * sizeof (mnum_t));	// thread table

	data = newdata,	size += newspace;		// switch to new map
    }
    /*
     * Record the new capacities in the (possibly remapped)
     * header before the pointers are rebuilt from it.
     */
    h = (MLAHeader*) data;
    h->maxmsgs = newmaxmsgs;
    h->maxthreads = newmaxthreads;
    h->maxreplies = newmaxreplies;
    h->maxstrspace = newmaxstrspace;

    loadMLA();				// reload to setup new pointers
}
#undef	ADJUST
#undef	COPY

static const MLA* sortmla;		// XXX for qsort

/*
 * qsort comparison function for the sorted message table.
 * a and b point at message numbers; the messages they name in
 * sortmla's message table are ordered by ascending datetime.
 * sortmla must be set before qsort is called.
 *
 * The result is derived from comparisons rather than from a
 * subtraction: a difference converted to int can wrap or be
 * truncated (e.g. when datetime is unsigned or wider than
 * int) and then report the wrong order.
 */
static int
dtcmp(const void* a, const void* b)
{
    const MailMsg& ma = sortmla->getMsgTable()[*(mnum_t*)a];
    const MailMsg& mb = sortmla->getMsgTable()[*(mnum_t*)b];
    if (ma.datetime < mb.datetime)
	return (-1);
    return (ma.datetime > mb.datetime ? 1 : 0);
}

/*
 * Update the database.
 *
 * In order: the message and keyword-index databases are
 * synced (when open); the sorted message table is re-sorted
 * by date if out-of-order messages were seen; the header
 * counts are refreshed; the string pool is optionally
 * compacted; the string fields of message descriptors are
 * converted from pool pointers to pool offsets; and, when the
 * archive is not backed by an mmap'd file, the whole image is
 * written to the TOC file.
 *
 * Returns TRUE on success.  FALSE is returned only when the
 * TOC file cannot be opened or written in the non-mmap case.
 */
fxBool
ReadWriteMLA::update()
{
    if (msgdb)
	(*msgdb->sync)(msgdb, 0);
    if (ixdb)
	ixdb->Sync();
    if (sort) {
	sortmla = this;			// XXX
	qsort((mnum_t*)sorted, nmsgs, sizeof (mnum_t), dtcmp);
    }
    MLAHeader& h = *(MLAHeader*) data;
    h.nmsgs = nmsgs;			// total number of messages
    h.nthreads = nthreads;		// number of threads
    h.nreplies = nreplies;		// size of reply table

    /*
     * Make sure the file names and separator are in the
     * string pool before the pool is (possibly) compacted.
     */
    const char* ixfile = hashstr(ixFile);
    const char* msgfile = hashstr(msgFile);
    const char* sep = hashstr(msgSeparator);
    // temp is assigned, and later read, only when compactPool is set
    char* temp;
    if (compactPool) {
	/*
	 * Convert all old string offsets to pointers
	 * so that when we compact the pool the offsets
	 * will be recalculated using the compacted pool.
	 */
	for (u_int i = 0; i < oldmsgs; i++) {
	    MailMsg& msg = *(MailMsg*)&table[i];
	    fxAssert(isoffset(msg.name), "Bogus old message descriptor");
	    msg.name = getstr((off_t) msg.name);
	    msg.emailaddr = getstr((off_t) msg.emailaddr);
	    msg.replytoaddr = getstr((off_t) msg.replytoaddr);
	    msg.to = getstr((off_t) msg.to);
	    msg.date = getstr((off_t) msg.date);
	    msg.subject = getstr((off_t) msg.subject);
	    msg.msgid = getstr((off_t) msg.msgid);
	}
	oldmsgs = 0;			// force all msgs to be converted below
	/*
	 * Build a compacted copy of the string pool.  As
	 * a side effect string offsets in the hash buckets
	 * are set to reference the new (compacted) pool;
	 * these offsets are then copied back below when
	 * the message descriptors are re-processed.
	 *
	 * If the scratch buffer cannot be allocated the pool
	 * is simply left uncompacted.
	 */
	temp = (char*) malloc(strspace);
	if (temp != NULL)
	    compact(temp, strcount, strspace);
	else
	    warning("No space to compact string pool");
    }
    h.strcount = strcount;
    h.strspace = strspace;

    h.msgseparator = stroff(sep);	// inter-message pattern string
    h.msgfile = stroff(msgfile);	// messages file
    h.ixfile = stroff(ixfile);		// optional keyword inverted index file

    /*
     * Convert message descriptor strings from
     * string pool references to pool offsets.
     * Messages below oldmsgs already hold offsets.
     */
    for (u_int i = oldmsgs; i < nmsgs; i++) {
	MailMsg& msg = *(MailMsg*)&table[i];
	msg.name = (const char*) stroff(msg.name);
	msg.emailaddr = (const char*) stroff(msg.emailaddr);
	msg.replytoaddr = (const char*) stroff(msg.replytoaddr);
	msg.to = (const char*) stroff(msg.to);
	msg.date = (const char*) stroff(msg.date);
	msg.subject = (const char*) stroff(msg.subject);
	msg.msgid = (const char*) stroff(msg.msgid);
    }
    if (compactPool && temp != NULL) {
	/*
	 * Complete string pool compaction work by writing
	 * the compacted string data over top of the old
	 * data.  All string offsets should have already been
	 * updated above to reflect the contents of the new
	 * pool of strings.
	 */
	memcpy((char*) strpool, temp, strspace);
	free(temp);
    }
    fxBool status = TRUE;
    if (fd == -1) {			// no mmap'd file, must use write
	// XXX optimize update to write only new stuff
	fd = Sys::open(tocFile, O_WRONLY|O_CREAT, 0644);
	if (fd >= 0) {
	    flock(fd, LOCK_EX);
	    if (write(fd, data, size) != size) {
		perror("write");
		status = FALSE;
	    }
	    // the descriptor is only held for the duration of the write
	    close(fd), fd = -1;
	} else {
	    perror(tocFile);
	    status = FALSE;
	}
    }
    return (status);
}

/*
 * Add msg as a reply to parent.
 *
 * The first reply is stored inline in parent.replynum.  From
 * the second reply on, replies live in four-slot blocks of
 * the spillover table and parent.replynum holds the index of
 * the first block.  The last block of a chain can hold four
 * replies; when a fifth is needed its last slot is turned
 * into a link to a freshly allocated block, and the reply
 * that was stored there moves into the new block.  Existing
 * blocks are never relocated, so indices already stored in
 * the table stay valid.
 */
void
ReadWriteMLA::addReply(MailMsg& parent, MailMsg& msg)
{
    const u_int existing = parent.nreplies;	// replies before this one
    parent.nreplies++;
    if (existing == 0) {			// first reply is kept inline
	parent.replynum = msg.msgnum;
	return;
    }

    mnum_t* link;				// slot that will point at new block
    if (existing == 1) {
	/* only the inline reply so far: it moves to a new block */
	link = &parent.replynum;
    } else {
	/* walk to the last block; chained blocks hold 3 replies + link */
	mnum_t* block = (mnum_t*) &replies[parent.replynum];
	u_int used = existing;
	while (used > 4) {
	    block = (mnum_t*) &replies[block[3]];
	    used -= 3;
	}
	if (used < 4) {				// room left in the last block
	    block[used] = msg.msgnum;
	    return;
	}
	link = &block[3];			// last block is full
    }

    /*
     * Allocate a new block at the end of the spillover table:
     * slot 0 takes the reply displaced from the link slot,
     * slot 1 the new reply.
     */
    fxAssert(nreplies < maxreplies, "Reply spillover table overflow");
    mnum_t* spill = (mnum_t*) replies;
    spill[nreplies+0] = *link;
    spill[nreplies+1] = msg.msgnum;
    *link = (mnum_t) nreplies;
    nreplies += 4;
}
