/*
 * list.c - measure memory load latency
 *
 * usage: list size-in-MB [stride ...]
 *
 * XXX - not done, doesn't work, just copied from the read case.
 *
 * Copyright (c) 1994 Larry McVoy.  Distributed under the FSF GPL with
 * additional restriction that results may be published only if
 * (1) the benchmark is unmodified, and
 * (2) the version in the sccsid below is included in the report.
 * Support for this development by Sun Microsystems is gratefully acknowledged.
 */
/*
 * Version identification string (the "sccsid" the copyright notice above
 * refers to).  main() writes it to file descriptor 2 at startup.
 */
char	*id = "$Id: lat_mem_wr.c,v 1.1 1994/11/18 08:49:48 lm Exp $\n";

/*
 * N      - number of load/store statements the timing arithmetic in
 *          loads() assumes for each timed pass.
 * STRIDE - default stride handed to loads() when no stride is given on
 *          the command line.  The cast keeps it an int: it is printed
 *          with %d and passed to a function whose parameter is int, and
 *          a bare sizeof expression would make it a size_t.
 *          NOTE(review): loads() uses the stride as a byte offset, so
 *          the default depends on the pointer size - confirm intended.
 * TRIES  - number of timed passes per measurement; the fastest is kept.
 * LOWER  - smallest range, in bytes, that main() measures.
 */
#define N       1000000
#define STRIDE  ((int)(512/sizeof(char *)))
#define	TRIES	4
#define	LOWER	512

#include	<limits.h>
#include	<stdio.h>
#include	<stdlib.h>
#include	<string.h>
#include	<unistd.h>

#include	"timing.c"

/*
 * main - allocate the test buffer and sweep the working-set size.
 *
 * av[1] is the buffer size in megabytes; every further argument is a
 * stride.  With no stride arguments the default STRIDE is used.  For each
 * stride a "stride=" header is written to stderr and loads() is run on
 * every range from LOWER up to the buffer size, advancing with step().
 * The clock value from getclock() is printed on stdout and handed to
 * loads().
 *
 * Exits with status 1 on a missing or unusable size argument or when the
 * buffer cannot be allocated, 0 otherwise.
 */
int
main(int ac, char **av)
{
	int	mb;
	int	len;
	int	range;
	int	stride;
	int	i;
	float	clk, getclock();
	int	loads(), step();	/* defined further down, old style */
	char	*addr;

	write(2, id, strlen(id));
	if (ac < 2) {
		fprintf(stderr, "usage: list size-in-MB [stride ...]\n");
		exit(1);
	}

	/*
	 * Reject sizes whose byte count does not fit in an int; the range
	 * arithmetic below and in loads() is all done in int.
	 */
	mb = atoi(av[1]);
	if (mb <= 0 || mb > INT_MAX / (1024 * 1024)) {
		fprintf(stderr, "list: bad size \"%s\"\n", av[1]);
		exit(1);
	}
	len = mb * 1024 * 1024;
	addr = malloc((size_t)len);
	if (addr == NULL) {
		fprintf(stderr, "list: cannot allocate %d MB\n", mb);
		exit(1);
	}
	clk = getclock();
	printf("clk=%.2f\n", clk);

	if (ac == 2) {
		/*
		 * No strides given.  The casts matter: loads() is called
		 * without a prototype and %d expects an int.
		 */
		fprintf(stderr, "\"stride=%d\n", (int)STRIDE);
		for (range = LOWER; range <= len; range = step(range)) {
			loads(addr, range, (int)STRIDE, clk);
		}
	} else {
		for (i = 2; i < ac; ++i) {
			stride = atoi(av[i]);
			fprintf(stderr, "\"stride=%d\n", stride);
			for (range = LOWER; range <= len; range = step(range)) {
				loads(addr, range, stride, clk);
			}
			fprintf(stderr, "\n");
		}
	}
	exit(0);
}

/*
 * One timed operation: load the next pointer, store it back into the
 * location it was read from, and advance to it.
 * XXX - make sure that the compiler does not optimize this out.  It
 * didn't w/ an old GCC on a sparc2.
 */
#define	ONE	tmp = (char **)*p; *p = (char *)tmp; p = tmp;
#define	FIVE	ONE ONE ONE ONE ONE
#define	TEN	FIVE FIVE
#define	FIFTY	TEN TEN TEN TEN TEN
#define	HUNDRED	FIFTY FIFTY

/*
 * loads - time a walk around a ring of pointers laid out in addr.
 *
 * addr    buffer of at least range bytes
 * range   number of bytes of addr covered by the ring
 * stride  distance in bytes between successive ring elements; must be a
 *         positive multiple of sizeof(char *)
 * clk     per-clock time used to subtract instruction overhead; the
 *         arithmetic below treats it as nanoseconds
 *
 * Writes "<range in MB> <nanoseconds per operation>" to stderr.
 * Returns 0 after a measurement, -1 if the arguments were rejected
 * (a message is written to stderr in that case).
 *
 * This deliberately stays an old-style definition: callers invoke it
 * without a prototype, so clk arrives promoted to double and only an
 * old-style float parameter converts it back correctly.
 */
int
loads(addr, range, stride, clk)
	char	*addr;
	int	range;
	int	stride;
	float	clk;
{
	register char **p, **tmp;
	int	i;
	int	tries;
	int	result = 0x7fffffff;

	/*
	 * A zero stride would never advance the set-up loop and a negative
	 * one would index before the buffer.
	 */
	if (stride <= 0) {
		fprintf(stderr, "list: stride must be positive.\n");
		return (-1);
	}
	if (stride & (sizeof(char *) - 1)) {
		fprintf(stderr, "list: stride must be aligned.\n");
		return (-1);
	}
	if (addr == 0 || range < (int)sizeof(char *)) {
		fprintf(stderr, "list: nothing to walk.\n");
		return (-1);
	}

	/*
	 * Set up a list of pointers where the contents of each pointer is
	 * the location of the next element in the list, like so:
	 * addr[0] -> addr[stride] -> addr[2 * stride] ....
	 * The last element points back at addr[0].  The end test is
	 * written as "stride >= range - i" so that it cannot overflow.
	 */
	for (i = 0; i < range; i += stride) {
		p = (char **)&addr[i];
		if (stride >= range - i) {
			*p = &addr[0];
			break;
		}
		*p = &addr[i + stride];
	}

	/*
	 * Now walk them and time it, keeping the fastest of TRIES passes.
	 * Each trip round the while loop performs 1000 operations, so
	 * starting the counter at N gives exactly the N operations the
	 * arithmetic below divides by.
	 */
	for (tries = 0; tries < TRIES; ++tries) {
		i = N;
		p = (char **)addr;
		start();
		while (i > 0) {
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			i -= 1000;
		}
		i = stop(p);
		if (i < result) {
			result = i;
		}
	}

	/*
	 * We want to get to nanoseconds / store.  What we have is the
	 * time it took to do N statements, where N is 1 million and a
	 * statement is a load, store, move.  The value from stop() is
	 * treated as microseconds here (assumed - confirm in timing.c).
	 *
	 * We want just the memory latency time, not including the
	 * time to execute the instructions.  We allow three clocks
	 * for the instructions themselves.  We know that the store
	 * would force a (cache) load anyway so we include that in
	 * the store time.
	 * So we need to subtract off 3 * N * clk nanoseconds.
	 *
	 * XXX - we do not account for loop overhead here.
	 */
	i = (clk * N * 3) / 1000;	/* instruction time in usecs */

	/*
	 * usecs -> nanoseconds -> nanoseconds per operation.  Done in
	 * double because the intermediate product overflows an int once a
	 * pass takes longer than about two seconds.
	 */
	result = (int)((double)(result - i) * 1000.0 / N);
	fprintf(stderr, "%.5f %d\n", range / (1024. * 1024), result);
	return (0);
}

/*
 * step - return the next range to measure after k (both in bytes).
 *
 * Below 1K the range doubles; above that it grows by a fixed increment
 * that gets larger as k crosses 4K, 32K, 64K, 128K, 256K and 512K, ending
 * with 512K steps for everything from 512K upward.
 *
 * Return and parameter types are spelled out; the original relied on
 * implicit int, which is no longer valid C.
 */
int
step(int k)
{
	if (k < 1024) {
		k = k * 2;
	} else if (k < 4 * 1024) {
		k += 1024;
	} else if (k < 32 * 1024) {
		k += 2048;
	} else if (k < 64 * 1024) {
		k += 4096;
	} else if (k < 128 * 1024) {
		k += 8192;
	} else if (k < 256 * 1024) {
		k += 16384;
	} else if (k < 512 * 1024) {
		k += 32 * 1024;
	} else {
		k += 512 * 1024;
	}
	return (k);
}

/*
 * getclock - read the clock value printed by the external "mhz -c"
 * command.
 *
 * Runs the command through popen() and parses the first floating point
 * number on its output.  The unit is whatever mhz prints; the caller's
 * arithmetic treats it as nanoseconds per clock (TODO confirm against
 * mhz).
 *
 * Returns the parsed value, or 0 with a message on stderr when the
 * command cannot be started or prints nothing parseable.  The pipe is
 * always closed before returning.
 */
float
getclock(void)
{
	float	c = 0;
	FILE	*f = popen("mhz -c", "r");

	if (f == NULL) {
		fprintf(stderr, "list: cannot run \"mhz -c\"\n");
		return (0);
	}
	if (fscanf(f, "%f", &c) != 1) {
		fprintf(stderr, "list: no clock value from \"mhz -c\"\n");
		c = 0;
	}
	pclose(f);
	return (c);
}
