#include <iostream.h>
#include <fstream.h>

#ifdef WIN32
#include <windows.h>
#else
#include <unistd.h>
#endif

#include <stdlib.h>
#include <timeit.h>

/*
 * Cache-behaviour micro-benchmark.
 *
 * Fills a size x size int array with rand() values, ntimes over,
 * first walking it row by row and then column by column, and prints
 * the time measured for each traversal on cerr.
 *
 * Always returns 0.
 */
int main( void )
{
  const int ntimes = 10;
  const int size = 1024;

  /* static storage duration: size*size ints (4 MiB when int is
     4 bytes) is large enough to overflow the default stack on some
     platforms if declared as an automatic variable. */
  static int data[ size ][ size ];

  timeobj *tt = timeit_new();

  int ii, jj, kk;

  /* create data sink - some compilers optimize away
     calculations that are performed but never used.
     this ofstream is created to dump the results
     to the platform's null device so they are 'used'.
   */
  ofstream dump;
#ifdef WIN32
  dump.open( "NUL" );
#else
  dump.open( "/dev/null" );
#endif

  /*
   * access array data by row
   * C/C++ arrays are stored row-major, so the inner loop walks
   * consecutive addresses; this should be cache-friendly and
   * reasonably fast.
   */

  timeit_start( tt );
  for( ii=0; ii<ntimes; ii++ )
    {
      for ( jj=0; jj<size; jj++ )
        {
          for ( kk=0; kk<size; kk++ )
            {
              data[ jj ][ kk ] = rand();
            }
        }
    }
  timeit_stop( tt );

  /* a faux use of the data (to fool optimizing compilers) */
  for( ii=0; ii<size; ii++ )
    {
      for( jj=0; jj<size; jj++ )
        {
          dump << data[ii][jj];
        }
    }
  cerr << "row access (s):" << timeit_getf( tt, timeit_seconds ) << endl;

  /*
   * access data by column
   * the inner loop now jumps a whole row (size ints) between
   * consecutive writes, so on some platforms nearly every access
   * touches a different cache line; this should be painfully slow.
   */

  timeit_start( tt );
  for( ii=0; ii<ntimes; ii++ )
    {
      for ( jj=0; jj<size; jj++ )
        {
          for ( kk=0; kk<size; kk++ )
            {
              data[ kk ][ jj ] = rand();
            }
        }
    }
  timeit_stop( tt );

  /* a faux use of the data (to fool optimizing compilers).
     the inner loop must run over jj: reusing ii here would end the
     outer loop after one pass and index column 'size', which is
     past the end of each row. */
  for( ii=0; ii<size; ii++ )
    {
      for( jj=0; jj<size; jj++ )
        {
          dump << data[ii][jj];
        }
    }
  cerr << "column access (s):" << timeit_getf( tt, timeit_seconds ) << endl;

  timeit_delete( tt );

  return( 0 );
}
