/*
* $Id: nanotimer.c,v 1.9 2003/02/28 21:18:04 rgb Exp $
*
* See copyright in copyright.h and the accompanying file COPYING
*
*/

/*
 *========================================================================
 * timing and utility sources.  tv_start and tv_stop are globals.
 *========================================================================
 */

#include "cpu_rate.h"

/*
 * The following two routines constitute a NONportable, but very accurate,
 * intel/amd timer for pentia and athlons (tsc flag in /proc/cpuinfo).
 * The init call basically extracts the CPU clock from /proc/cpuinfo
 * and converts it into nanoseconds/cpu clock cycle.  The second reads
 * the cpu cycle counter directly and converts it into nanoseconds.
 *
 * To avoid potential problems with loss of precision on the interval
 * subtraction, this program stores the register contents on the
 * first call and only counts the nanoseconds from the first call, not
 * since the last boot.
 *
 * It is quite fast (order of 40-50 nsec).
 */

/*
 * Initialize *new_nanotimer so that it uses the CPU cycle counter (rdtsc).
 *
 * Steps, in order:
 *   1. Set the timer's name and point its timer function at
 *      nanotimer_rdtsc().
 *   2. Derive the global nsec_per_cycle from the first usable "cpu MHz"
 *      line of /proc/cpuinfo.
 *   3. Read the cycle counter once and store it in the globals
 *      ax_first/dx_first/count_first; nanotimer_rdtsc() subtracts this
 *      base from every later reading.
 *   4. Time two empty loops and one loop full of nanotimer_rdtsc() calls
 *      to estimate the cost of a single timer call, stored in
 *      new_nanotimer->nsec_granularity.
 *
 * If nsec_per_cycle is still zero after step 2 (file unreadable, no
 * "cpu MHz" line, or no positive number on it) an error is written to
 * stderr and the program exits.
 */
void nanotimer_rdtsc_init(Nanotimer *new_nanotimer)
{

 int i;
 char statbuf[K];
 char *colon,*mhz_text,*mhz_end;
 FILE *cpuinfo_fd;
 double cpu_mhz;
 double loop_delta_1000,loop_delta_1000000,loop_ratio;

 snprintf(new_nanotimer->name,K,"cpu cycle counter nanotimer");
 if(verbose == V_TIMER){
   printf("#========================================================================\n");
   printf("# Initializing timer %s\n",new_nanotimer->name);
 }
 new_nanotimer->timer = nanotimer_rdtsc;

 /*
  * First we extract the number of nanoseconds per cycle (required!)
  * from /proc/cpuinfo.  The line of interest looks like
  * "cpu MHz : <number>"; everything after the first ':' is handed to
  * strtod().
  *
  * A failed fopen() is not fatal here: nsec_per_cycle is simply left
  * untouched and the check further down reports the problem.  Lines
  * without a ':' or without a positive number are skipped instead of
  * being converted, so a malformed entry can neither dereference a NULL
  * token nor produce an infinite nsec_per_cycle.
  */
 cpuinfo_fd = fopen("/proc/cpuinfo","r");
 if(cpuinfo_fd != NULL){

   /* Normal EOF (or a read error) ends the scan */
   while(fgets(statbuf,K,cpuinfo_fd) != NULL){

     if(strncmp(statbuf,"cpu MHz",7) != 0) continue;

     colon = strchr(statbuf,':');
     if(colon == NULL) continue;            /* no value field at all */

     mhz_text = colon + 1;
     cpu_mhz = strtod(mhz_text,&mhz_end);
     /* reject "nothing parsed", zero, negative and NaN */
     if(mhz_end == mhz_text || !(cpu_mhz > 0.0)) continue;

     nsec_per_cycle = 1000.0/cpu_mhz;
     break;
   }

   fclose(cpuinfo_fd);
 }

 if(nsec_per_cycle == 0.0){
   fprintf(stderr,"Error: Cannot parse out the cpu MHz from /proc/cpuinfo.\n");
   fprintf(stderr,"Cannot use built-in CPU cycle counter as timer.\n");
   fprintf(stderr,"Try the -g flag to use gettimeofday instead.\n");
   /*
    * NOTE(review): this is an error path but the exit status is 0.
    * Left unchanged because wrappers may depend on it -- confirm and
    * consider a non-zero status.
    */
   exit(0);
 }

 /*
  * Now we set the key elements of the nanotimer struct and TEST its
  * call granularity.
  *
  * We subtract off the time base to ensure that times of
  * order seconds (and then some) have resolvable differences
  * in double precision.  This is therefore the initial call to
  * the cpu cycle timer to extract the initial time base and
  * MUST precede the use of the timer.  nanotimer_rdtsc() does NOT
  * check that this has happened, to keep the timer fast!
  */
 asm volatile("rdtsc" : "=a" (ax_first), "=d" (dx_first));
 count_first = dx_first;
 count_first = count_first<<32;
 count_first += ax_first;

 /*
  * To determine the granularity, we just start calling it in a loop until
  * we've accumulated a few zillion calls, and see how long they took on
  * average.  We also crudely time the loop itself, and make sure that
  * its timing SCALES WITH SIZE.
  */
 new_nanotimer->stop = 0.0;
 new_nanotimer->delta = 0.0;
 /*
  * Empty loop.  The printf(""); seems more reliable than anything
  * including a nanosleep etc. at ensuring that the test start on a
  * clock cycle break, yielding the most consistent results.
  */
 printf("");
 new_nanotimer->start = nanotimer_rdtsc();
 for(i=0;i<1000;i++){
 }
 new_nanotimer->stop = nanotimer_rdtsc();
 loop_delta_1000 = new_nanotimer->stop - new_nanotimer->start;
 if(verbose == V_TIMER) printf("# loop_delta_1000 = %f\n",loop_delta_1000);

 printf("");
 new_nanotimer->start = nanotimer_rdtsc();
 for(i=0;i<1000000;i++){
 }
 new_nanotimer->stop = nanotimer_rdtsc();
 loop_delta_1000000 = new_nanotimer->stop - new_nanotimer->start;
 if(verbose == V_TIMER) printf("# loop_delta_1000000 = %f\n",loop_delta_1000000);

 loop_ratio = loop_delta_1000000/loop_delta_1000;
 if(verbose == V_TIMER) printf("# loop_ratio = %f (should be near 1000)\n",loop_ratio);

 /*
  * It should get bigger.  I don't much care how MUCH bigger.
  * The test is written as !(ratio >= 10) so that a NaN ratio (0/0,
  * both loops measured as taking no time) also triggers the warning;
  * every comparison with NaN is false, so "ratio < 10" would miss it.
  */
 if(!(loop_ratio >= 10)){
   fprintf(stderr,"Warning:  Empty loop measurement not scaling in nanotimer_rdtsc_init()\n");
   fprintf(stderr,"  The compiler may optimize away empty loops!\n");
 }
 
 new_nanotimer->stop = 0.0;
 new_nanotimer->delta = 0.0;
 /*
  * Full loop.  We start right after a clock tick, we hope, and
  * we subtract out the loop_delta from above.
  */
 printf("");
 new_nanotimer->start = nanotimer_rdtsc();
 for(i=0;i<1000000;i++){
   new_nanotimer->stop = nanotimer_rdtsc();
 }
 new_nanotimer->delta = new_nanotimer->stop - new_nanotimer->start - loop_delta_1000000;
 if(verbose == V_TIMER) printf("# nanotimer delta = %f for 1000000 iterations\n",new_nanotimer->delta);
 
 /* average cost of one timer call over the 1000000 iterations */
 new_nanotimer->nsec_granularity = new_nanotimer->delta/1000000.0;

 if(verbose == V_TIMER){
   printf("# %s: granularity = %f nanoseconds\n",
      new_nanotimer->name,new_nanotimer->nsec_granularity);
 }

}

/*
 * Read the CPU cycle counter and return the time, in nanoseconds,
 * elapsed since the base count stored in the global count_first.
 *
 * The conversion uses the global nsec_per_cycle.  Neither global is
 * validated here; nanotimer_rdtsc_init() is the routine that sets them.
 */
double nanotimer_rdtsc()
{
 unsigned long lo_word, hi_word;
 unsigned long long cycles;

 /* rdtsc hands back the counter in two halves: low in eax, high in edx */
 asm volatile("rdtsc" : "=a" (lo_word), "=d" (hi_word));

 /* glue the halves together, then make the count relative to the base */
 cycles = ((unsigned long long) hi_word << 32) + lo_word;
 cycles -= count_first;

 /* cycles -> nanoseconds */
 return((double) cycles * nsec_per_cycle);

}

/*
 * This is a portable nanosecond timer.  It uses gettimeofday (wall clock
 * time) with the time of the first call subtracted off to keep intervals
 * from horribly overflowing the double with irrelevant numbers (causing
 * a loss of precision).  Note that my direct measurements show that
 * gettimeofday() itself takes about 2 usec to complete.
 */

/*
 * Initialize *new_nanotimer so that it uses gettimeofday().
 *
 * Sets the timer's name and points its timer function at
 * nanotimer_gttod(), then times two empty loops (1000 and 1000000
 * iterations) and one 1000000-iteration loop of nanotimer_gttod() calls.
 * The full-loop time minus the empty-loop time, divided by 1000000, is
 * stored in new_nanotimer->nsec_granularity as the estimated cost of one
 * timer call.  start, stop and delta are used as scratch space and are
 * left holding the values from the last measurement.
 */
void nanotimer_gttod_init(Nanotimer *new_nanotimer)
{

 int i;
 double loop_delta_1000,loop_delta_1000000,loop_ratio;

 snprintf(new_nanotimer->name,K,"gettimeofday nanotimer");
 if(verbose == V_TIMER){
   printf("#========================================================================\n");
   printf("# Initializing timer %s\n",new_nanotimer->name);
 }
 new_nanotimer->timer = nanotimer_gttod;

 /*
  * Now we set the key elements of the nanotimer struct and TEST its
  * call granularity.
  *
  * To determine the granularity, we just start calling it in a loop until
  * we've accumulated a few zillion calls, and see how long they took on
  * average.  We also crudely time the loop itself, and make sure that
  * its timing SCALES WITH SIZE.
  */
 new_nanotimer->stop = 0.0;
 new_nanotimer->delta = 0.0;
 /*
  * Empty loop.  The printf(""); seems more reliable than anything
  * including a nanosleep etc. at ensuring that the test start on a
  * clock cycle break, yielding the most consistent results.
  */
 printf("");
 new_nanotimer->start = nanotimer_gttod();
 for(i=0;i<1000;i++){
 }
 new_nanotimer->stop = nanotimer_gttod();
 loop_delta_1000 = new_nanotimer->stop - new_nanotimer->start;
 if(verbose == V_TIMER) printf("# loop_delta_1000 = %f\n",loop_delta_1000);

 printf("");
 new_nanotimer->start = nanotimer_gttod();
 for(i=0;i<1000000;i++){
 }
 new_nanotimer->stop = nanotimer_gttod();
 loop_delta_1000000 = new_nanotimer->stop - new_nanotimer->start;
 if(verbose == V_TIMER) printf("# loop_delta_1000000 = %f\n",loop_delta_1000000);

 loop_ratio = loop_delta_1000000/loop_delta_1000;
 if(verbose == V_TIMER) printf("# loop_ratio = %f (should be near 1000)\n",loop_ratio);

 /*
  * It should get bigger.  I don't much care how MUCH bigger.
  *
  * The timer only advances in whole microseconds (it is built from
  * tv_sec/tv_usec), so when the empty loops cost nothing both deltas
  * can be exactly zero and the ratio is 0/0 = NaN.  Every comparison
  * with NaN is false, so "ratio < 10" would stay silent in precisely
  * the case this warning is for; !(ratio >= 10) catches it.
  */
 if(!(loop_ratio >= 10)){
   fprintf(stderr,"Warning:  Empty loop measurement not scaling in nanotimer_gttod_init()\n");
   fprintf(stderr,"  The compiler may optimize away empty loops!\n");
 }
 
 new_nanotimer->stop = 0.0;
 new_nanotimer->delta = 0.0;
 /*
  * Full loop.  We start right after a clock tick, we hope, and
  * we subtract out the loop_delta from above.
  */
 printf("");
 new_nanotimer->start = nanotimer_gttod();
 for(i=0;i<1000000;i++){
   new_nanotimer->stop = nanotimer_gttod();
 }
 new_nanotimer->delta = new_nanotimer->stop - new_nanotimer->start - loop_delta_1000000;
 if(verbose == V_TIMER) printf("# nanotimer delta = %f for 1000000 iterations\n",new_nanotimer->delta);
 
 /* average cost of one timer call over the 1000000 iterations */
 new_nanotimer->nsec_granularity = new_nanotimer->delta/1000000.0;

 if(verbose == V_TIMER){
   printf("# %s: granularity = %f nanoseconds\n",
      new_nanotimer->name,new_nanotimer->nsec_granularity);
 }

}

/*
 * Return the wall-clock time, in nanoseconds, measured relative to the
 * time stored in the global tv_first.
 *
 * While tv_first is still all zero it is filled in with the current
 * time, so the first call establishes the base and returns a value
 * close to zero.
 */
double nanotimer_gttod()
{

 struct timeval now;
 double elapsed;

 /*
  * Working relative to a base taken on first use keeps the seconds
  * field small, so the microsecond part is not lost when everything
  * is folded into one double.
  */
 if(tv_first.tv_sec == 0 && tv_first.tv_usec == 0){
    gettimeofday(&tv_first, NULL);
 }

 gettimeofday(&now, NULL);

 /* elapsed time in seconds: whole seconds plus the microsecond part */
 elapsed = (double)(now.tv_sec - tv_first.tv_sec);
 elapsed += 1.0e-6*(double)(now.tv_usec - tv_first.tv_usec);

 /* seconds -> nanoseconds */
 elapsed *= 1.e+9;
 return(elapsed);

}

