/*
 * lat_ctx.c - context switch timer 
 *
 * usage: lat_ctx [-s size] #procs [#procs....]
 *
 * Copyright (c) 1994 Larry McVoy.  Distributed under the FSF GPL with
 * additional restriction that results may published only if
 * (1) the benchmark is unmodified, and
 * (2) the version in the sccsid below is included in the report.
 * Support for this development by Sun Microsystems is gratefully acknowledged.
 */
/*
 * RCS version string.  main() writes it to stderr at startup, and the
 * license above requires it to accompany any published results.
 */
char	*id = "$Id: lat_ctx.c,v 1.1 1994/11/18 08:49:48 lm Exp $\n";

#include "timing.c"
/*
 * You may not report numbers with this define turned on.
 */
/* #define	VIRTUAL_ADDR_CACHE */	/* Sun internal only */

#define	TRIES	12	/* runs per measurement; the smallest result is kept */
#define	WRITES	2000	/* token passes timed in each run */

int	process_size, *data;	/* size & pointer to an array that big */
int	pids[100];	/* children of the current ring; slot 0 is never used */

/*
 * Forward declarations for everything that is called before it is
 * defined.  killem() and caches() are listed as well so that their
 * callers do not rely on implicit function declarations.
 */
int	pipe_cost(), overhead(), ctx(), sumit(), killem(), caches();
void	doit();

int
main(ac, av)
	int	ac;
	char	**av;
{
	int	i;
	int	tries;
	int	result;
	int	min;
	int	overhead;

	write(2, id, strlen(id));
	if (ac < 2) {
usage:		printf("Usage: %s [-s kbytes] processes [processes ...]\n",
		    av[0]);
	}

	/*
	 * If they specified a context size, get it.
	 */
	if (!strcmp(av[1], "-s")) {
#ifdef	VIRTUAL_ADDR_CACHE
		int	nprocs = 0;
#endif

		if (ac < 3) {
			goto usage;
		}
#ifdef	VIRTUAL_ADDR_CACHE
		for (i = 3; i < ac; ++i) {
			if (atoi(av[i]) > nprocs) {
				nprocs = atoi(av[i]);
			}
		}
#endif
		process_size = atoi(av[2]) * 1024;
#ifdef	VIRTUAL_ADDR_CACHE
		data = (int *)malloc(process_size * nprocs);
#else
		data = (int *)malloc(process_size);
#endif
		ac -= 2;
		av += 2;
	}

	overhead = pipe_cost();

	fprintf(stderr, "\n\"size=%d ovr=%d\n", process_size/1024, overhead);

	for (i = 1; i < ac; ++i) {
		min = 0x7fffffff;
		for (tries = 0; tries < TRIES; ++tries) {
			result = ctx(overhead, WRITES, atoi(av[i]));
			if (min > result) {
				min = result;
			}
		}
	    	fprintf(stderr, "%d %d\n", atoi(av[i]), min);
	}
	return (0);
}

/*
 * Time a token travelling around a ring of `procs' processes joined by
 * pipes.  The calling process is member 0; members 1 .. procs-1 are
 * forked here and run doit().
 *
 *	overhead	per-hop pipe cost to subtract (see pipe_cost())
 *	writes		nominal number of hops to time; the timed loop
 *			makes writes / procs trips around the ring
 *	procs		ring size, 1 .. 100
 *
 * Returns (elapsed / writes) - overhead, where elapsed is whatever
 * stop() reports (presumably microseconds; start()/stop() come from
 * timing.c, which is not visible here).  Exits with status 1 on a bad
 * ring size or on any pipe, fork, read or write failure.
 */
int
ctx(overhead, writes, procs)
	int	overhead, writes, procs;
{
	int	p[100][2];
	int	msg = 0, i;
	int	time;
	int	sum;

	/*
	 * p[] (and pids[], which has the same capacity) are indexed up
	 * to procs-1, and both p[procs-1] and writes / procs below need
	 * at least one process.
	 */
	if (procs < 1 || procs > (int)(sizeof(p) / sizeof(p[0]))) {
		fprintf(stderr, "ctx: process count %d out of range\n", procs);
		exit(1);
	}

	/*
	 * Get a bunch of pipes.
	 */
	for (i = 0; i < procs; ++i) {
		if (pipe(p[i]) == -1) {
			perror("pipe");
			exit(1);
		}
	}

	/*
	 * Use the pipes as a ring, and fork off a bunch of processes
	 * to pass the byte through their part of the ring.
	 */
	signal(SIGTERM, SIG_IGN);
	for (i = 1; i < procs; ++i) {
		switch (pids[i] = fork()) {
		    case -1:
			perror("fork");
			/*
			 * Only slots 1 .. i-1 belong to this ring; higher
			 * slots may still hold pids from an earlier call.
			 * Then give up: falling through would run the
			 * child loop in the parent.
			 */
			killem(i);
			exit(1);

		    case 0:	/* child */
#ifdef	VIRTUAL_ADDR_CACHE
			/*
			 * This is a hack to put the per process data
			 * at different virtual addresses.  It was
			 * added to see if it makes a difference on
			 * ROSS HyperSPARC modules that have a VAC.
			 * It does.
			 */
			data = (int *)((char *)data + process_size * i);
#endif
			doit(p, i-1, i);
			/* NOTREACHED */

		    default:	/* parent */
			;
		}
	}

	/*
	 * Go once around the loop to make sure that everyone is ready and
	 * to get the token in the pipeline.
	 */
	if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg) ||
	    read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg) ||
	    write(p[0][1], &msg, sizeof(msg)) != sizeof(msg)) {
		perror("write/read/write on pipe");
		exit(1);
	}
	bzero(data, process_size);	/* make sure we have our own copy */

	/*
	 * Main process - all others should be ready to roll, time the
	 * loop.
	 */
	start();
	for (i = writes / procs; i--; ) {
		if (read(p[procs-1][0], &msg, sizeof(msg)) != sizeof(msg)) {
			perror("read/write on pipe");
			exit(1);
		}
		sum = sumit();	/* value unused; the call touches the data */
		if (write(p[0][1], &msg, sizeof(msg)) != sizeof(msg)) {
			perror("read/write on pipe");
			exit(1);
		}
	}
	time = stop();

	/*
	 * Close the pipes and kill the children.
	 */
	killem(procs);
	for (i = 0; i < procs; ++i) {
		close(p[i][0]);
		close(p[i][1]);
		if (i > 0) {
			wait(0);	/* one wait per forked child */
		}
	}

	/*
	 * We know the overhead cost of each pipe trip, but we did it
	 * write times.
	 */
	return ((time / writes) - overhead);
}

/*
 * Send SIGTERM to the children recorded in pids[1] .. pids[procs-1].
 * Slots that do not hold a positive pid are skipped.
 *
 * Always returns 0.  The return type is spelled out as int, matching
 * what callers already assume for an undeclared function.
 */
int
killem(procs)
	int	procs;
{
	int	i;

	for (i = 1; i < procs; ++i) {
		if (pids[i] > 0) {
			kill(pids[i], SIGTERM);
		}
	}
	return (0);
}

/*
 * Body of one forked ring member.  Restores the default SIGTERM action
 * (the parent ignores it before forking), zeroes the shared data array
 * so this process has touched it, and then loops forever: read the
 * token from pipe p[rd], run sumit(), write the token to pipe p[wr].
 *
 * Never returns.  Exits with status 1 when a read or write does not
 * transfer a whole token; otherwise it runs until it is signalled.
 */
void
doit(p, rd, wr)
	int	p[100][2];
	int	rd, wr;
{
	int	token, total;
	int	in_fd, out_fd;

	signal(SIGTERM, SIG_DFL);
	bzero(data, process_size);	/* make sure we have our own copy */

	in_fd = p[rd][0];
	out_fd = p[wr][1];
	while (1) {
		if (read(in_fd, &token, sizeof(token)) != sizeof(token)) {
			perror("read/write on pipe");
			exit(1);
		}
		total = sumit();
		if (write(out_fd, &token, sizeof(token)) != sizeof(token)) {
			perror("read/write on pipe");
			exit(1);
		}
	}
}

/*
 * Measure the pipe overhead TRIES times and keep the smallest value,
 * on the theory that any larger reading also paid for unrelated system
 * activity.
 *
 * Returns the cost, in usecs, of one trip through one pipe.
 */
int
pipe_cost()
{
	int	best = 0x7fffffff;
	int	run, cost;

	for (run = 0; run < TRIES; ++run) {
		cost = overhead();
		if (cost < best) {
			best = cost;
		}

		/*
		 * Stirring the caches between runs is what makes the
		 * overhead numbers come out consistent.  Weird, but so.
		 * The value caches() returns is of no interest.
		 */
		(void)caches();
	}
	return (best);
}

/*
 * Calculate the cost of passing a byte through a pipe.  I do it with a
 * bunch of pipes to try and burn through the onboard caches.  Note that
 * on a Sun SPARC ss2, it made little difference if the loop was over one
 * or thirty pipes.
 *
 * Up to 20 pipes are opened; the token is read from pipe k and written
 * to the next one, wrapping after the last, for WRITES iterations, with
 * one sumit() call per iteration.
 *
 * Returns the timed total (whatever stop() reports) divided by WRITES.
 * Exits with status 1 if no pipe can be created or a transfer is short.
 */
int
overhead()
{
	int	p[100][2];
	int	msg = 0, sum, i, n, k;

	/*
	 * Get a bunch of pipes.
	 */
	n = 0;
	while (n < 20 && pipe(p[n]) != -1)
		n++;
	if (n == 0) {
		/* not even one pipe: p[0] holds no valid descriptors */
		perror("pipe");
		exit(1);
	}

	/*
	 * Measure the overhead of passing a byte around the ring.
	 */
	k = 0;
	if (write(p[k][1], &msg, sizeof(msg)) != sizeof(msg)) {
		perror("read/write on pipe");
		exit(1);
	}
	start();
	for (i = 0; i < WRITES; ++i) {
		if (read(p[k][0], &msg, sizeof(msg)) != sizeof(msg)) {
			perror("read/write on pipe");
			exit(1);
		}
		/*
		 * Advance to the next pipe, wrapping at the end.  Without
		 * the increment every transfer would reuse pipe 0.
		 */
		if (++k == n) {
			k = 0;
		}
		sum = sumit();	/* value unused; the call touches the data */
		if (write(p[k][1], &msg, sizeof(msg)) != sizeof(msg)) {
			perror("read/write on pipe");
			exit(1);
		}
	}
	k = stop();	/* k is reused to hold the elapsed time */
	for (i = 0; i < n; ++i) {
		close(p[i][0]);
		close(p[i][1]);
	}
	return (k / WRITES);
}

/*
 * Add up the ints in the global data array, 512 at a time, using a
 * fully unrolled body.  With n = process_size / sizeof(int), a chunk is
 * summed only while more than 512 ints remain, so the last chunk (full
 * or partial) is never read and nothing happens when n <= 512.
 *
 * Returns the sum.
 */
int
sumit()
{
	int	*p = data;
	int	total = 0;
	int	remaining;

#define	SUMIT_10	total+=p[0]+p[1]+p[2]+p[3]+p[4]+p[5]+p[6]+p[7]+p[8]+p[9];p+=10;
#define	SUMIT_50	SUMIT_10 SUMIT_10 SUMIT_10 SUMIT_10 SUMIT_10
#define	SUMIT_100	SUMIT_50 SUMIT_50
#define	SUMIT_512	SUMIT_100 SUMIT_100 SUMIT_100 SUMIT_100 SUMIT_100 \
			SUMIT_10 total+=*p++;total+=*p++;

	remaining = process_size/sizeof(int);
	while (remaining > 512) {
		SUMIT_512
		remaining -= 512;
	}
	return (total);
}

#define	SIZE	(1024 * 1024)

/*
 * This is a huge unrolled loop that is supposed to blow the instruction
 * and the data caches in an attempt to get more reproducible numbers.
 * It sort of works.
 *
 * Allocates SIZE bytes, zeroes them, and reads them back 8192 bytes per
 * loop pass; the pass is skipped once 8192 or fewer bytes remain, so the
 * final 8K is written but not read.
 *
 * Returns the byte sum, which is 0 because the buffer was just zeroed.
 * If the buffer cannot be allocated the failure is reported and 0 is
 * returned without touching anything, since this is only a best-effort
 * cache flush.
 */
int
caches()
{
	int	i, sum = 0;
	char	*d = (char *)malloc(SIZE);
	char	*save = d;	/* d is advanced by the macros below */

	if (!d) {
		perror("malloc");
		return (0);
	}

	/* written explicitly so that every byte of the buffer is touched */
	bzero(d, SIZE);

#define	TEN	sum+=d[0]+d[1]+d[2]+d[3]+d[4]+d[5]+d[6]+d[7]+d[8]+d[9];d+=10;
#define	FIFTY	TEN TEN TEN TEN TEN
#define	HUNDRED	FIFTY FIFTY
#define	HALFK	HUNDRED HUNDRED HUNDRED HUNDRED HUNDRED TEN sum+=*d++;sum+=*d++;
#define	KILO	HALFK HALFK

	for (i = SIZE; i > 8192; i -= 8192) {
		KILO KILO KILO KILO
		KILO KILO KILO KILO
	}
	free(save);
	return (sum);
}
