/*
 * bw_mem_cp.c - simple memory copy test
 *
 * Usage: bw_mem_cp size libc|unrolled aligned|unaligned
 *
 * Measures both unrolled (simplistic) and library (general) copy
 * times of aligned & unaligned data.  Aligned here means that the
 * source and destination are aligned to page boundaries, not that
 * the pointers are word aligned.
 *
 * Copyright (c) 1994 Larry McVoy.  Distributed under the FSF GPL with
 * additional restriction that results may be published only if
 * (1) the benchmark is unmodified, and
 * (2) the version in the sccsid below is included in the report.
 * Support for this development by Sun Microsystems is gratefully acknowledged.
 *
 * Neal Nuckolls pointed out that src/dst should not both be on page
 * boundaries or they collide on the same direct-mapped cache line.
 * Hence the aligned/unaligned stuff.
 * XXX - I'm not sure that the aligned/unaligned stuff works.  Neal said
 * that he had to rewack an old version of this to get it right.  Check
 * with him.
 */
/* RCS version string; main() writes it to file descriptor 2 at startup. */
char	*id = "$Id: bw_mem_cp.c,v 1.1 1994/11/18 08:49:48 lm Exp $\n";
#include	<limits.h>
#include	"timing.c"

/*
 * TYPE is the element type moved by the copy loop.  It defaults to
 * double and may be overridden on the compiler command line.
 */
#ifndef TYPE
#define TYPE    double
#endif
/* Size in bytes of one TYPE element; parenthesized so it is safe in any expression. */
#define	SIZE	(sizeof(TYPE))
/*
 * N is the number of times the copy is repeated inside the timed
 * region.  It defaults to 8 and may be overridden at compile time.
 */
#ifndef N
#define N	8
#endif

/* Helpers defined later in this file. */
int	last(char *s);
int	unrolled(TYPE *src, TYPE *dst, int bytes);

/* Buffer layout constants used by main(). */
enum {
	BUF_ALIGN = 8192,	/* src and dst are rounded up to this boundary */
	BUF_SLACK = 16384,	/* extra bytes allocated so rounding cannot overrun */
	DST_SKEW = 128		/* dst offset applied in the unaligned case */
};

/*
 * Round p up to the next BUF_ALIGN boundary.  The pointer always
 * advances (by 1..BUF_ALIGN bytes), so the caller must have allocated
 * at least BUF_ALIGN bytes of slack.
 */
static TYPE *
round_up(TYPE *p)
{
	unsigned long	addr = (unsigned long)p;

	addr = (addr + BUF_ALIGN) & ~(unsigned long)(BUF_ALIGN - 1);
	return ((TYPE *)addr);
}

/*
 * main - time N copies of a buffer and report the copy bandwidth.
 *
 * av[1] is the buffer size in bytes, optionally suffixed with k/K
 * (x1024) or m/M (x1024*1024).  av[2] selects bcopy() when it is "libc"
 * and the unrolled() loop for any other value.  av[3] leaves dst on the
 * same BUF_ALIGN boundary as src when it is "aligned"; any other value
 * offsets dst by DST_SKEW bytes.
 *
 * Writes "<megabytes copied> <megabytes per second>" to stderr and
 * exits 0.  Exits 1 on a usage error, an unusable size, an allocation
 * failure, or an elapsed time too small to measure.
 */
int
main(int ac, char **av)
{
	int	usecs, i, bytes, scale, use_libc;
	TYPE	*src, *dst;
	double	mb, secs;

	write(2, id, strlen(id));
	if (ac != 4) {
		fprintf(stderr,
		    "Usage: %s size libc|unrolled aligned|unaligned\n", av[0]);
		exit(1);
	}

	/*
	 * Reject empty, non-numeric, zero and negative sizes before looking
	 * at the suffix, so last() is never handed an empty string.
	 */
	bytes = atoi(av[1]);
	if (bytes <= 0) {
		fprintf(stderr, "%s: bad size \"%s\"\n", av[0], av[1]);
		exit(1);
	}
	scale = 1;
	switch (last(av[1])) {
	case 'k':
	case 'K':
		scale = 1024;
		break;
	case 'm':
	case 'M':
		scale = 1024 * 1024;
		break;
	default:
		break;
	}
	/* Make sure neither bytes * scale nor bytes + BUF_SLACK overflows an int. */
	if (bytes > (INT_MAX - BUF_SLACK) / scale) {
		fprintf(stderr, "%s: size \"%s\" is too large\n", av[0], av[1]);
		exit(1);
	}
	bytes *= scale;

	src = (TYPE *)malloc(bytes + BUF_SLACK);
	dst = (TYPE *)malloc(bytes + BUF_SLACK);
	if (!src || !dst) {
		perror("malloc");
		exit(1);
	}

	src = round_up(src);
	dst = round_up(dst);
	if (strcmp(av[3], "aligned") != 0) {
		/* Skew dst so src and dst no longer share the same alignment. */
		dst = (TYPE *)((unsigned long)dst + DST_SKEW);
	}
	bzero(src, bytes);	/* for Linux */
	bzero(dst, bytes);	/* for Linux */

	/* Decide which copy to run before the clock starts. */
	use_libc = (strcmp(av[2], "libc") == 0);
	if (use_libc) {
		start();
		for (i = 0; i < N; ++i) {
			bcopy(src, dst, bytes);
		}
		usecs = stop(dst);
	} else {
		start();
		for (i = 0; i < N; ++i) {
			unrolled(src, dst, bytes);
		}
		usecs = stop(dst);
	}

	if (usecs <= 0) {
		/* Avoid dividing by zero and printing a meaningless "inf". */
		fprintf(stderr, "%s: elapsed time too small to measure\n",
		    av[0]);
		exit(1);
	}
	mb = bytes / (1024. * 1024);
	/* Average in floating point so no microseconds are truncated away. */
	secs = (double)usecs / (N) / 1000000.0;
	fprintf(stderr, "%.04f %.2f\n", mb, mb / secs);
	exit(0);
}

/*
 * unrolled - copy `bytes` bytes from src to dst, eight TYPE elements
 * per loop iteration.
 *
 * Only whole groups of eight elements are copied; a trailing remainder
 * smaller than 8 * sizeof(TYPE) bytes is left untouched.  A zero or
 * negative byte count copies nothing.  Always returns 0.
 *
 * XXX - Neal said that this could be made faster if I did all the loads
 * then all the stores.  Think about this.  It doesn't hold true on a ss2.
 */
int
unrolled(TYPE *src, TYPE *dst, int bytes)
{
	/*
	 * Keep the chunk size as an int: comparing the signed byte count
	 * against an unsigned sizeof expression would convert a negative
	 * count to a huge unsigned value and run off the end of the buffers.
	 */
	const int	chunk = (int)(8 * SIZE);

#if 0
	/* Disabled alternative: all eight loads first, then all eight stores. */
	TYPE	tmp[8];

	while (bytes >= chunk) {
		tmp[0] = src[0];
		tmp[1] = src[1];
		tmp[2] = src[2];
		tmp[3] = src[3];
		tmp[4] = src[4];
		tmp[5] = src[5];
		tmp[6] = src[6];
		tmp[7] = src[7];
		dst[0] = tmp[0];
		dst[1] = tmp[1];
		dst[2] = tmp[2];
		dst[3] = tmp[3];
		dst[4] = tmp[4];
		dst[5] = tmp[5];
		dst[6] = tmp[6];
		dst[7] = tmp[7];
		bytes -= chunk;
		dst += 8;
		src += 8;
	}
#else
	while (bytes >= chunk) {
		*dst++ = *src++;	/* 1 */
		*dst++ = *src++;	/* 2 */
		*dst++ = *src++;	/* 3 */
		*dst++ = *src++;	/* 4 */
		*dst++ = *src++;	/* 5 */
		*dst++ = *src++;	/* 6 */
		*dst++ = *src++;	/* 7 */
		*dst++ = *src++;	/* 8 */
		bytes -= chunk;
	}
#endif
	return (0);
}

/*
 * last - return the final character of the NUL-terminated string s.
 *
 * Returns 0 when s is a null pointer or the empty string, so the caller
 * never reads memory in front of the buffer.
 */
int
last(char *s)
{
	char	*end;

	if (!s || !*s)
		return (0);
	/* Advance until the next character is the terminating NUL. */
	for (end = s; end[1]; ++end)
		;
	return (*end);
}
