/*** analog 1.2 ***/
/* Please read Readme.html, or http://www.statslab.cam.ac.uk/~sret1/analog/  */

/*** analog.c; the main function ***/

#include "analhead2.h"

/*
 * main -- the whole analysis run.
 *
 *   1. initialise() processes the command line (and, with vblesonly set,
 *      printvbles() is called, which is expected not to return).
 *   2. The logfile (or stdin when the name is literally "stdin") is read
 *      line by line.  Each line is tried first as a common-format line and
 *      then as an old-style NCSA line; anything else is counted as corrupt.
 *   3. Wanted lines update the global totals; successful requests
 *      (code <= 299 or code == 304) additionally feed the monthly, daily,
 *      weekly, day-of-week and hourly tallies and the request / host /
 *      domain hash tables.
 *   4. After the last line, hostnames and filenames are aliased (unless
 *      that was already done while masking), the domain and directory
 *      tables are derived from the host and request tables, and the
 *      requested reports are sorted.
 *   5. output() writes the report.
 *
 * Returns OK.  Exits with ERR if the logfile cannot be opened.
 */
int main(int argc, char **argv)
{
  extern void urltodir();                   /* in alias.c */
  extern flag doaliashost();                /* in alias.c */
  extern flag doaliasurl();                 /* in alias.c */
  extern flag wantfile();                   /* in alias.c */
  extern flag wanthost();                   /* in alias.c */
  extern void reqhashadd();                 /* in hash.c */
  extern void domhashadd();                 /* in hash.c */
  extern void dirhashadd();                 /* in hash.c */
  extern void hosthashadd();                /* in hash.c */
  extern void approxhosthashadd();          /* in hash.c */
  extern struct url *reqsort();             /* in hash.c */
  extern struct dir *dirsort();             /* in hash.c */
  extern struct host *hostsort();           /* in hash.c */
  extern int domsort();                     /* in hash.c */
  extern void subdomsort();                 /* in hash.c */
  extern void addmonthlydata();             /* in hash.c */
  extern void adddailydata();               /* in hash.c */
  extern void addweeklydata();              /* in hash.c */
  extern void initialise();                 /* in init.c */
  extern void printvbles();                 /* in init.c */
  extern void output();                     /* in output.c */
  extern int sscanf_common();               /* in sscanf.c */
  extern int sscanf_ncsaold();              /* in sscanf.c */
  extern int strtomonth();                  /* in utilities.c */
  extern int dayofdate();                   /* in utilities.c */
  extern int minsbetween();                 /* in utilities.c */
  extern long timecode();                   /* in utilities.c */
  extern struct timestruct startofweek();   /* in utilities.c */
  extern void *xmalloc();                   /* in utilities.c */

  extern char commandname[];    /* all global vars declared in init.c */
  extern char logfile[];
  extern FILE *lf;
  extern flag vblesonly, bq, maskq, q7;
  extern flag mq, hq, dq, Dq, Wq, sq, Sq, oq, iq, rq;
  extern int domsortby, reqsortby, dirsortby;
  extern int domfloor, reqfloor, dirfloor, hostfloor;
  extern struct timestruct firsttime, lasttime, fromtime, totime, oldtime;
  extern int total_succ_reqs, total_succ_reqs7, total_fail_reqs;
  extern int total_fail_reqs7, total_other_reqs, total_other_reqs7;
  extern int no_hosts, no_hosts7, no_new_hosts7;
  extern int no_urls, no_urls7;
  extern int corrupt_lines, other_lines;
  extern double total_bytes, total_bytes7;
  extern struct monthly *firstm, *lastm;
  extern struct weekly *firstw, *lastw;
  extern struct daily *firstd, *lastd;
  extern int dailyreq[], hourlyreq[];
  extern struct host *hosthead[];
  extern struct url *urlhead[];
  extern int rnumber, inumber, onumber, Snumber;
  extern int monthlength[];
  extern flag debug;

  char inputline[MAXLINELENGTH];  /* a particular input line */
  int linetype;          /* COMMON, NCSAOLD or CORRUPT */
  char hostn[MAXSTRINGLENGTH];
  int date, year, hr, min, monthno;
  long thistimecode;
  char month[4];
  int day;
  char filename[MAXSTRINGLENGTH];
  int code;
  double bytes;    /* long is not big enough; double has more sig. figs,
                      and copes with overflow automatically. */
  char bytestr[16];

  struct monthly *tempmp;
  struct daily *tempdp;
  struct weekly *tempwp;

  /* The sort heads and firstdom are always handed to output(), even when
     the corresponding report is off, so give them defined values. */
  struct url *urlsorthead = NULL, *urlp, *urllastp, *urlnextp;
  struct dir *dirsorthead = NULL;
  struct host *hostp, *hostnextp, *hostlastp, *hostsorthead = NULL;
  int firstdom = 0;         /* for sorting */

  flag wantthisone = TRUE;  /* whether we want to analyse a particular entry */

  /* are we now in the last 7 days?  Must start FALSE: it is only
     recomputed when q7 is on but is tested for every wanted line. */
  flag last7q = FALSE;

  int onlist;       /* which list we are on, while sorting etc. */

  int ch;           /* for discarding the tail of an over-long line */

  int i, j;

  /*** Initialisation ***/

  initialise(argc, argv);

  if (vblesonly)
    printvbles();    /* which also exits */

  /*** Now start scanning ***/

  if (strcmp(logfile, "stdin") == 0)   /* first check the logfile exists */
    lf = stdin;
  else {
    lf = fopen(logfile, "r");
    if (lf == NULL) {
      fprintf(stderr, "%s: Error: Failed to open log file %s: exiting\n",
              commandname, logfile);
      exit(ERR);
    }
  }

  while (fgets(inputline, MAXLINELENGTH, lf) != NULL) {

    linetype = CORRUPT;   /* until one of the parsers accepts the line */

    if (sscanf_common(inputline, hostn, &date, month, &year, &hr, &min,
                      filename, &code, bytestr) == 9) {
      linetype = COMMON;
    }

    else if (sscanf_ncsaold(inputline, hostn, month, &date, &hr, &min,
                            &year, filename) == 7) {
      linetype = NCSAOLD;
      /* The old-style parser is given neither a status code nor a byte
         count to fill in, so supply values here rather than reusing
         whatever an earlier line (or nothing at all) left behind.
         NOTE(review): old-style lines are treated as successful requests
         of zero bytes -- confirm this matches the intended policy. */
      code = 200;
      strcpy(bytestr, "0");
    }

    if (linetype != CORRUPT) {
      monthno = strtomonth(month);
      /* monthno, date and hr are used below as indices into reqs[],
         monthlength[] and hourlyreq[]; refuse anything that could not be
         a real month (0-11), day of month (1-31) or hour (0-23). */
      if (monthno < 0 || monthno > 11 || date < 1 || date > 31 ||
          hr < 0 || hr > 23)
        linetype = CORRUPT;
    }

    if (linetype == NCSAOLD && bq) {
      /* first old-style line seen: byte counts are no longer meaningful */
      bq = OFF;
      fprintf(stderr,
        "%s: Warning: Logfile contains old-style lines with no bytes data.\n",
              commandname);
      if ((domsortby == BYBYTES && oq) || (reqsortby == BYBYTES && rq) ||
          (dirsortby == BYBYTES && iq)) {
        fprintf(stderr, "Sorting will be by requests, not bytes.\n");
        if (domsortby == BYBYTES && oq) {
          domsortby = BYREQUESTS;
          domfloor = MIN_DOM_REQS;
        }
        if (reqsortby == BYBYTES && rq) {
          reqsortby = BYREQUESTS;
          reqfloor = MIN_URL_REQS;
        }
        if (dirsortby == BYBYTES && iq) {
          dirsortby = BYREQUESTS;
          dirfloor = MIN_DIR_REQS;
        }
      }
    }

    if (linetype != CORRUPT) {

      thistimecode = timecode(date, monthno, year, hr, min);

      if (maskq) {   /* then check if we want this entry */
        wantthisone = (thistimecode > fromtime.code &&
                       thistimecode < totime.code && wantfile(filename) &&
                       wanthost(hostn));
      }

      if (!wantthisone)
        ++other_lines;

      else {

        /* Are we in the last 7 days? Check this every time in case */
        /* logfile is not in chronological order */

        if (q7) {
          last7q = FALSE;
          if (thistimecode > oldtime.code)
            last7q = TRUE;
        }

        bytes = atof(bytestr);
        total_bytes += bytes;
        if (last7q)
          total_bytes7 += bytes;

        if (code <= 299 || code == 304) {  /* successes */

          day = dayofdate(date, monthno, year);

          ++total_succ_reqs;
          if (last7q)
            ++total_succ_reqs7;

          if (total_succ_reqs == 1) {  /* i.e. this is the first */
            firsttime.date = date;
            firsttime.monthno = monthno;
            firsttime.year = year;
            firsttime.hr = hr;
            firsttime.min = min;
            firsttime.code = thistimecode;
            lasttime.date = date;
            lasttime.monthno = monthno;
            lasttime.year = year;
            lasttime.hr = hr;
            lasttime.min = min;
            lasttime.code = thistimecode;
            if (mq) {
              firstm = (struct monthly *) xmalloc(sizeof(struct monthly));
              for (j = 0; j < 12; j++)
                firstm -> reqs[j] = 0;
              firstm -> next = NULL;
              lastm = firstm;
            }
            if (Dq) {
              firstd = (struct daily *) xmalloc(sizeof(struct daily));
              for (j = 0; j < 31; j++)
                firstd -> reqs[j] = 0;
              firstd -> next = NULL;
              lastd = firstd;
            }
            if (Wq) {
              firstw = (struct weekly *) xmalloc(sizeof(struct weekly));
              firstw -> reqs = 0;
              firstw -> start = startofweek(firsttime);
              firstw -> next = NULL;
              lastw = firstw;
            }
          }

          /* date cataloguing */

          if (mq) {   /* one struct monthly per year, 12 counters each */

            if (year <= firsttime.year) {
                              /* then we might need a new lot of months */
              for (i = firsttime.year - year; i > 0; i--) {
                tempmp = firstm;
                firstm = (struct monthly *) xmalloc(sizeof(struct monthly));
                firstm -> next = tempmp;
                for (j = 0; j < 12; j++)
                  firstm -> reqs[j] = 0;
              }
              firstm -> reqs[monthno]++;   /* 1 more for this month */
            }

            else if (year >= lasttime.year) {     /* similarly */
              for (i = year - lasttime.year; i > 0; i--) {
                tempmp = lastm;
                lastm = (struct monthly *) xmalloc(sizeof(struct monthly));
                tempmp -> next = lastm;
                lastm -> next = NULL;
                for (j = 0; j < 12; j++)
                  lastm -> reqs[j] = 0;
              }
              lastm -> reqs[monthno]++;
            }

            else   /* NB we will never get here if logfile in chron. order */
              addmonthlydata(year, monthno);

          }   /* end if (mq) */

          if (Dq) {   /* one struct daily per month, 31 counters each */

            if (year * 12 + monthno <=
                firsttime.year * 12 + firsttime.monthno) {
                              /* then we might need a new lot of days */
              for (i = (firsttime.year - year) * 12 +
                   firsttime.monthno - monthno; i > 0; i--) {
                tempdp = firstd;
                firstd = (struct daily *) xmalloc(sizeof(struct daily));
                firstd -> next = tempdp;
                for (j = 0; j < 31; j++)
                  firstd -> reqs[j] = 0;
              }
              firstd -> reqs[date - 1]++;   /* 1 more for this date */
            }

            else if (year * 12 + monthno >=
                     lasttime.year * 12 + lasttime.monthno) {
              for (i = (year - lasttime.year) * 12 - lasttime.monthno + monthno;
                   i > 0; i--) {
                tempdp = lastd;
                lastd = (struct daily *) xmalloc(sizeof(struct daily));
                tempdp -> next = lastd;
                lastd -> next = NULL;
                for (j = 0; j < 31; j++)
                  lastd -> reqs[j] = 0;
              }
              lastd -> reqs[date - 1]++;
            }

            else   /* NB we will never get here if logfile in chron. order */
              adddailydata(year, monthno, date);

          }   /* end if (Dq) */

          if (Wq) {
            if (thistimecode < firstw -> start.code) {   /* new week needed */
              while (thistimecode < firstw -> start.code) {
                tempwp = firstw;
                firstw = (struct weekly *) xmalloc(sizeof(struct weekly));
                firstw -> next = tempwp;
                firstw -> reqs = 0;
                firstw -> start = tempwp -> start;
                firstw -> start.date -= 7;   /* step back one week */
                if (firstw -> start.date <= 0) {
                  /* fell off the front of the month: borrow the length of
                     the previous month (plus one for a leap February) */
                  firstw -> start.monthno--;
                  if (firstw -> start.monthno == -1) {
                    firstw -> start.monthno = 11;
                    firstw -> start.year--;
                  }
                  firstw -> start.date = monthlength[firstw -> start.monthno] +
                    firstw -> start.date + (firstw -> start.monthno == 1 &&
                                            firstw -> start.year % 4 == 0);
                }
                firstw -> start.code = timecode(firstw -> start.date,
                                                firstw -> start.monthno,
                                                firstw -> start.year, 0, 0);
              }
              firstw -> reqs++;
            }

            else if (thistimecode >= lastw -> start.code) {
              while (minsbetween(lastw -> start.date, lastw -> start.monthno,
                                 lastw -> start.year, 0, 0,  /* 10080m = 1w */
                                 date, monthno, year, 0, 0) >= 10080) {
                tempwp = lastw;
                lastw = (struct weekly *) xmalloc(sizeof(struct weekly));
                tempwp -> next = lastw;
                lastw -> next = NULL;
                lastw -> reqs = 0;
                lastw -> start = tempwp -> start;
                lastw -> start.date += 7;   /* step forward one week */
                if (lastw -> start.date > monthlength[lastw -> start.monthno] +
                (lastw -> start.monthno == 1 && lastw -> start.year % 4 == 0)) {
                  /* ran past the end of the month: carry into the next */
                  lastw -> start.date -= monthlength[lastw -> start.monthno] +
                  (lastw -> start.monthno == 1 && lastw -> start.year % 4 == 0);
                  lastw -> start.monthno++;
                  if (lastw -> start.monthno == 12) {
                    lastw -> start.monthno = 0;
                    lastw -> start.year++;
                  }
                }
                lastw -> start.code = timecode(lastw -> start.date,
                                               lastw -> start.monthno,
                                               lastw -> start.year, 0, 0);
              }
              lastw -> reqs++;
            }

            else  /* again, only used if logfile not chronological */
              addweeklydata(year, monthno, date);

          }   /* end if (Wq) */

          ++dailyreq[day];  /* these are so little work, it's cheaper just to */
          ++hourlyreq[hr];
                            /* do them rather than to check dq and hq */

          /* Only now widen [firsttime, lasttime]; the cataloguing above
             relies on the values as they stood before this request. */

          if (thistimecode < firsttime.code) {
            firsttime.date = date;
            firsttime.monthno = monthno;
            firsttime.year = year;
            firsttime.hr = hr;
            firsttime.min = min;
            firsttime.code = thistimecode;
          }

          if (thistimecode > lasttime.code) {
            lasttime.date = date;
            lasttime.monthno = monthno;
            lasttime.year = year;
            lasttime.hr = hr;
            lasttime.min = min;
            lasttime.code = thistimecode;
          }

          /* Now for the request report. We ignore all filename conversions
             and aliases until outside the loop; it is more efficient to do
             them just once at the end. Also note that we want to construct
             a request report even if (iq AND NOT rq) because it is more
             efficient to convert filenames into directories once at the end
             than every time. */

          if (rq || iq) {
            reqhashadd(filename, 1, bytes, last7q, OFF);
          }

          /* We leave the directory report until the end; it's easier to do
             work once for each filename, not once for each request */

          /* Now for the hostname count. Just the same as above. This time,
             however, we don't do one if the domain report is on and this
             is off, because this takes up a lot of memory. */

          if (sq == ON) {
            hosthashadd(hostn, 1, bytes, last7q);
          }

          /* If there is no exact hostname count, do the domain report now */

          else if (oq) {
            if (!maskq)
              doaliashost(hostn);  /* o/wise it's already been done */
            domhashadd(hostn, 1, bytes);
          }

          if (sq == APPROX) {
            if (!maskq && !oq)
              doaliashost(hostn);
            approxhosthashadd(hostn, last7q);
          }

        }    /* end if code <= 299 || code == 304 */

        else if (code >= 400) {
          ++total_fail_reqs;
          if (last7q)
            ++total_fail_reqs7;
        }

        else {   /* code 300's (not 304): redirects */
          ++total_other_reqs;
          if (last7q)
            ++total_other_reqs7;
        }

      }  /* end if want this one */

    }   /* end if linetype != CORRUPT */

    else {   /* line is corrupt */
      ++corrupt_lines;
      if (debug != 0)
        fprintf(stderr, "C: %s", inputline);
      if (strchr(inputline, '\n') == NULL) {
                                /* line corrupt by being too long; */
        /* Discard the rest of the line INCLUDING its newline; leaving the
           newline in the stream would make the next fgets() return an
           empty line that gets counted as a second corrupt line. */
        while ((ch = getc(lf)) != EOF && ch != '\n')
          ;
        if (debug != 0)
          fprintf(stderr, "\n");
      }
    }

  }   /*** end of main loop (reading logfile) ***/

  fclose(lf);

  if (total_succ_reqs == 0) {   /* nothing to report: switch reports off */
    mq = OFF;
    dq = OFF;
    hq = OFF;
    oq = OFF;
    iq = OFF;
    rq = OFF;
    q7 = OFF;
    Sq = OFF;
    Dq = OFF;
    Wq = OFF;
  }

  else {   /* there are things to report */

    if (total_succ_reqs7 + total_fail_reqs7 + total_other_reqs7 == 0)
      q7 = OFF;   /* just total_bytes no good in case (!bq) */

    /* Next run through the list of hosts, doing the aliasing as required.
       Each hash chain is walked until an entry whose name is NULL.  The
       "next chain" lookups below are guarded so that nothing is read from
       index HOSTHASHSIZE / URLHASHSIZE once the last chain is finished
       (the arrays' extents are not visible from here). */

    if (sq == ON && !maskq) {   /* if maskq, the aliasing has been done */
      onlist = 0;                          /* the list of hosts we are on */
      hostp = hosthead[0];                 /* starting at list 0 */
      hostlastp = hostp;
      for ( ; onlist < HOSTHASHSIZE; hostp = hostnextp) {
                                           /* run through hosts */
        if (hostp -> name == NULL)    {    /* then finished this list */
                                           /* so start the next list */
          hostnextp = (++onlist < HOSTHASHSIZE) ? hosthead[onlist] : NULL;
          hostlastp = hostnextp;
        }
        else {                          /* a real host */
          strcpy(hostn, hostp -> name);
          if (doaliashost(hostn)) {     /* if there was an alias to do */
            no_hosts--;                 /* this wasn't a real host */
            if (hostp -> last7)
              no_hosts7--;
            if (hostp -> last7 && !(hostp -> pre7))
              no_new_hosts7--;
            hosthashadd(hostn, hostp -> reqs, hostp -> bytes,
                        hostp -> last7);  /* account it in the right place */
            /* and take this host out of the list */
            hostnextp = hostp -> next;
            if (hostp == hostlastp) {    /* we are at the head of the list */
              hosthead[onlist] = hostp -> next;
              hostlastp = hostnextp;   /* and we still are afterwards */
            }
            else {
              hostlastp -> next = hostp -> next;
                                       /* and hostlastp is unchanged */
            }
          }
          else {  /* real host, but no alias */
            hostnextp = hostp -> next;
            hostlastp = hostp;
          }
        }   /* end 'else real host' */

      }   /* end for all hosts */

    }   /* end if (sq == ON && !maskq) */


    /* Now the domain report. This is now easy because all the hostnames
       are already aliased etc. */

    if (oq && sq == ON) {
      onlist = 0;                          /* the list of hosts we are on */
      hostp = hosthead[0];                 /* starting at list 0 */
      for ( ; onlist < HOSTHASHSIZE; hostp = hostnextp) {
                                           /* run through hosts */
        if (hostp -> name == NULL) {       /* then finished this list */
                                           /* so start the next list */
          hostnextp = (++onlist < HOSTHASHSIZE) ? hosthead[onlist] : NULL;
        }
        else {
          strcpy(hostn, hostp -> name);
          domhashadd(hostn, hostp -> reqs, hostp -> bytes);
          hostnextp = hostp -> next;
        }
      }
    }


    /* Now for aliasing filenames. */

    if ((rq || iq) && !maskq) {
                         /* again, if maskq, the aliasing has been done */
      onlist = 0;                        /* the list of files we are on */
      urlp = urlhead[0];                 /* starting at list 0 */
      urllastp = urlp;
      for ( ; onlist < URLHASHSIZE; urlp = urlnextp) {
                                         /* run through files */
        if (urlp -> name == NULL)    {   /* then finished this list */
                                         /* so start the next list */
          urlnextp = (++onlist < URLHASHSIZE) ? urlhead[onlist] : NULL;
          urllastp = urlnextp;
        }
        else {
          strcpy(filename, urlp -> name);
          /* entries re-added below are flagged (last argument ON) so that
             they are not aliased a second time when the walk meets them */
          if (!(urlp -> aliasdone) && doaliasurl(filename)) {
                                       /* if there was an alias to do */
            no_urls--;                 /* this wasn't a real file */
            if (urlp -> last7)
              no_urls7--;
            reqhashadd(filename, urlp -> reqs, urlp -> bytes,
                       urlp -> last7, ON);  /* account it in the right place */
            /* and take this file out of the list */
            urlnextp = urlp -> next;
            if (urlp == urllastp) {    /* we are at the head of the list */
              urlhead[onlist] = urlp -> next;
              urllastp = urlnextp;   /* and we still are afterwards */
            }
            else {
              urllastp -> next = urlp -> next; /* and urllastp is unchanged */
            }
          }
          else {  /* real filename, but no alias */
            urlnextp = urlp -> next;
            urllastp = urlp;
          }
        }   /* end 'else real URL' */

      }   /* end for all URLs */

    }   /* end if ((rq || iq) && !maskq) */


    /* Now the directory report. */

    if (iq) {
      onlist = 0;                        /* the list of files we are on */
      urlp = urlhead[0];                 /* starting at list 0 */
      for ( ; onlist < URLHASHSIZE; urlp = urlnextp) {
                                         /* run through files */
        if (urlp -> name == NULL) {      /* then finished this list */
                                         /* so start the next list */
          urlnextp = (++onlist < URLHASHSIZE) ? urlhead[onlist] : NULL;
        }
        else {
          strcpy(filename, urlp -> name);
          urltodir(filename);
          dirhashadd(filename, urlp -> reqs, urlp -> bytes);
          urlnextp = urlp -> next;
        }
      }
    }


    /* now for the checking and sorting */

    if (rq) {
      urlsorthead = reqsort();
      if (rnumber == 0)
        rq = OFF;
      else if (rnumber < -reqfloor)
                             /* i.e. we want top n but there are only m < n */
        reqfloor = 0;
    }

    if (iq) {
      dirsorthead = dirsort();
      if (inumber == 0)
        iq = OFF;
      else if (inumber < -dirfloor)
        dirfloor = 0;
    }

    if (Sq) {
      hostsorthead = hostsort();
      if (Snumber == 0)
        Sq = OFF;
      else if (Snumber < -hostfloor)
        hostfloor = 0;
    }

    if (oq) {
      firstdom = domsort();
      if (onumber == 0)
        oq = OFF;
      else if (onumber < -domfloor)
        domfloor = 1;
      if (oq)
        subdomsort();
    }

  }    /* end else (there are things to report) */

  /*** Finally, do all the output ***/

  output(urlsorthead, dirsorthead, hostsorthead, firstdom);

  return(OK);

}
