#!/usr/local/bin/perl -w

# $Id: nightly_webserver,v 1.9 2000/03/15 00:26:46 howland Exp $

# This script does 7 things:
#
# 1 cleans up files in /tmp
# 2 cycles /var/adm/pacct ( ckpacct )
# 3 on the first of every month, it creates summaries ( monacct )
# 4 forces the access log to be cycled, and any other web log larger
#   than half a megabyte.
# 5 runs webalizer
# 6 produces a report of new referrals
# 7 archives the log files
#
########################################################################
# Copyright (c) 2000 Tom Howland
#
# You may distribute copies or derivations of this file under the terms
# of either the GNU General Public License or the Artistic License, as
# specified in the Perl README file.
#
# This code is provided with no warranty of any kind,and is used
# entirely at your own risk. This code was written by the author as a
# private individual, and is in no way endorsed or warrantied.
#
# Support questions and suggestions can be directed to tom@rahul.net
# Download from http://www.rahul.net/tom/WebMonitor.html
#
########################################################################

# Extend the search path so the programs run below by bare name
# (ckpacct, monacct, webalizer, bzip2, ...) can be found.

$ENV{'PATH'} .= ':/usr/lib/acct:/usr/local/bin:/opt/apache/bin';

# From here on everything printed to STDOUT goes to STDERR instead.

open(STDOUT, '>&STDERR') or die "can't redirect STDOUT to STDERR: $!";

# autoflush the currently selected output handle

$| = 1;

########################################################################
# delete files that haven't been looked at for 7 days or more from /tmp
#
# find runs rm itself rather than piping the names through xargs: xargs
# splits its input on white space and quotes, so a file name containing
# a space or a newline, planted in the world-writable /tmp, could make rm
# remove a file outside /tmp. Directories are skipped; rm -f could not
# remove them anyway.

system 'find /tmp ! -type d ! -atime -7 -exec rm -f {} \; > /dev/null 2>&1';

########################################################################
# possibly cycle /var/adm/pacct

system 'ckpacct', '1000';

########################################################################
# bump the log files and run webalizer
#
# we save the logs in old/YYYYMM/DD.<LOG>.bz2, but first we run webalizer
# on them before we move and compress them.

# Look up the numeric user id and group id of the account 'edgarscan'
# (fields 2 and 3 of its passwd entry) and keep them in $e_uid and
# $e_gid. An unknown account yields an empty list, which aborts the run.

($e_uid, $e_gid) = (getpwnam('edgarscan'))[2, 3]
  or die "edgarscan not in passwd file";

# Every log path used below is relative to the Apache log directory.
#
# Note on error checks in this section: they use `or', not `||'. With
# `||' the die attaches to the last argument of the call (which is always
# true here), so a failed chdir/mkdir/chown would go unnoticed.

chdir '/opt/apache/logs' or die "can't chdir to /opt/apache/logs: $!";

# Work out this month's archive directory old/YYYYMM and, the first time
# it is needed in a month, create it and do the monthly housekeeping.
#
# Sets the globals
#   @lt   broken-down local time of this run
#   $day  archive path prefix 'old/YYYYMM/DD.', to which a log's name
#         is appended

{
  @lt = localtime(time);
  chdir 'old' or die "can't chdir to old: $!";
  my $dir = sprintf('%d%.2d', $lt[5] + 1900, $lt[4] + 1);
  unless(-d $dir){
    mkdir $dir, 0775 or die "can't mkdir $dir: $!";
    chown $e_uid, $e_gid, $dir or die "can't chown $dir: $!";

    # clean up year-old directories: YYYYMM - 100 names the same month
    # of the previous year. Everything in it that is not a directory is
    # removed, except paths containing 'access'.

    my $odir = $dir - 100;
    system "find $odir ! -type d -print | egrep -v access | xargs rm -f" if -d $odir;

    # make process usage summaries

    system 'monacct';

  }
  chdir '..' or die "can't chdir back out of old: $!";
  $day = 'old/' . $dir . sprintf('/%.2d.', $lt[3]);
}

# $bz2 accumulates the names of the files to compress, each preceded by
# a space, so that it can be appended to the bzip2 command at the end.

$bz2 = '';

# maybe archive a log
#
# bump(UNCONDITIONAL, LOG) decides whether the log file LOG (a name
# relative to the current directory) is archived by this run. It is
# archived when UNCONDITIONAL is true or the file is larger than half a
# megabyte, provided that neither $day.LOG nor $day.LOG.bz2 exists yet.
#
# A log that is archived is renamed to LOG.webalizer and an empty LOG
# with the old owner, group and permissions is put in its place. The
# pair (LOG.webalizer, $day.LOG) is appended to @rn and ' ' . $day.LOG
# is appended to $bz2, for the final move and compression.
#
# A LOG that does not exist is silently left alone; one that cannot be
# renamed is left alone with a warning. Nothing is queued in either case.

sub bump($$){
  my $unconditional = shift;
  my $x = shift;
  my $archive = $day . $x;

  # No such log: nothing to archive. Without this check a missing log
  # that is archived unconditionally would be replaced by an empty file
  # and a nonexistent LOG.webalizer would be queued for bzip2.

  my @st = stat($x) or return;
  my ($mode, $uid, $gid, $size) = @st[2, 4, 5, 7];

  # don't archive it unless its size is greater than half a megabyte or
  # we were told to unconditionally save it (first parameter is one).

  return unless $unconditional || $size > 1 << 19;

  # an archive (compressed or not) with today's name already exists

  return if -e "$archive.bz2" || -e $archive;

  my $webalizer = $x . '.webalizer';
  unless(rename $x, $webalizer){
    warn "can't rename $x to $webalizer: $!\n";
    return;
  }

  # Put an empty file back under the old name. Ownership and mode are
  # restored on a best-effort basis; only the permission bits of the
  # stat mode are passed to chmod.

  if(open my $fh, '>>', $x){
    close $fh;
    chown $uid, $gid, $x;
    chmod $mode & 07777, $x;
  }
  else {
    warn "can't recreate $x: $!\n";
  }

  push @rn, $webalizer, $archive;
  $bz2 .= ' ' . $archive;
}

# The access log is always put forward for archiving; the other logs are
# left to bump()'s size test.

$access = 'access_log';

bump(1, $access);
for my $log (qw(big.log error_log jserv.log mod_jserv.log)){
  bump(0, $log);
}

# nothing was queued for compression, so there is nothing more to do

exit if $bz2 eq '';

# now force Apache to start new logs by sending it a HUP

# hup(PIDFILE) sends a HUP signal to the process whose id is on the
# first line of PIDFILE.
#
# Dies if PIDFILE cannot be opened or its first line is not a positive
# integer. Warns if the signal cannot be delivered. Returns what kill
# returns, i.e. the number of processes signalled.

sub hup($){
  my $pidfile = shift;

  # `or', not `||': with `||' the die was attached to the file name, so
  # a failed open was never noticed.

  open my $fh, '<', $pidfile or die "can't open $pidfile: $!";
  my $line = <$fh>;
  close $fh;

  # Insist on a real process id. An undefined, empty or non-numeric
  # value numifies to 0, and kill with pid 0 signals every process in
  # our own process group. Matching the digits (instead of chopping the
  # last character) also copes with a file that has no trailing newline.

  defined $line && $line =~ /^\s*(\d+)\s*$/ && $1 > 0
    or die "$pidfile does not contain a process id\n";
  my $pid = $1;

  my $signalled = kill HUP => $pid;
  $signalled or warn "can't send HUP to process $pid (from $pidfile): $!\n";
  return $signalled;
}

hup('httpd.pid');

# the monitoring program `big' needs a HUP too for the same reasons

hup('big.pid');

# the access log as renamed by bump(); it is read again further down

$w = $access . '.webalizer';

# run webalizer before we archive them, filtering out any useless messages.
#
# The command goes through the shell (it needs the redirections); its
# standard error is merged into the output we read. `or' is used for the
# check: with `||' the die was attached to the command string and could
# never fire.

open my $webalizer_out, '-|', 'exec webalizer < ' . $w . ' 2>&1'
  or die "Couldn't open webalizer output: $!";
while(my $line = <$webalizer_out>){
  next if $line =~ /Skipping (bad|oversized log) record|Truncating oversized/;
  print $line;
}
close $webalizer_out;

# We are going to print out the new urls that are referring to us, but
# first we want to establish that it really is pointing to us and isn't
# just the url that the user happened to be looking at when he followed
# a bookmark to us or typed in our url. So we save the url on disk but
# we scan the contents of the url for occurrences of the string
# 'edgarscan' before we print it out.

$sn = 'seen';

# Make sure the file exists, because it is opened read/write ('+<')
# afterwards. Opening for append creates it when it is missing and
# leaves existing contents alone, without a separate existence test.

{
  open my $fh, '>>', $sn or die "can't create $sn: $!";
  close $fh;
}

# headster(TEXT) returns the host part of the first http URL in TEXT:
# whatever follows 'http://' up to the next '/' or the end of the string
# (so a ':port' is included), in lower case and without trailing dots.
# Returns undef when TEXT contains no such URL.
#
# The result is used as the key that decides whether a referrer has been
# seen before, so 'WWW.Example.COM' and 'www.example.com.' count as the
# same site.
#
# The original substitution s/\.\//\//g could never match, because the
# captured host cannot contain a '/'. What it was apparently meant to do
# (ignore a dot at the end of the host name) is done here on the host
# itself. The caller's $_ is no longer overwritten.

sub headster($){
  my $url = shift;
  $url =~ m{http://([^/]+)(?:/|$)}i or return undef;
  my $host = $1;
  $host =~ s/\.+\z//;
  $host =~ tr/A-Z/a-z/;
  return $host;
}

# Load the hosts of the referrers recorded by earlier runs. %seen is
# used as a set: only the presence of a key matters.
#
# SEEN is opened read/write and stays open after this section: new
# referrers are appended to it here, starting at offset $t.

open(SEEN, '+<', $sn) or die "can't open $sn: $!";
while(my $line = <SEEN>){
  chomp $line;
  my $d = headster($line);
  defined($d) || next;
  $seen{$d} = 0;
}

# Move to the end of the file before switching from reading to writing,
# and remember where this run's additions begin.

seek SEEN, 0, 2;
$t = tell(SEEN);
$n = 0;
$stamp = sprintf '%d/%.2d/%.2d ', $lt[5] + 1900, $lt[4] + 1, $lt[3];

# Scan the access log that was set aside for webalizer. On each line
# that matches the pattern, the first of the two quoted fields at the
# end is taken as the referring URL. An http URL whose host has not been
# seen before is appended to SEEN, prefixed with today's date, and
# counted in $n.
#
# `or' is used for the check: with `||' the die was attached to the file
# name and a failed open went unnoticed.

open(my $access_fh, '<', $w) or die "can't open $w: $!";
while(my $line = <$access_fh>){
  $line =~ /^\S+ \S+ \S+ \[[^\]]+\] \"[^\"]+\" \S+ \S+ \"(.+)\" \"([^\"]+)\"$/ || next;
  my $h = $1;
  $h =~ /^http:\/\/.+/i || next;
  my $d = headster($h);
  defined($d) || next;
  unless(exists $seen{$d}){
    $seen{$d} = 0;
    print SEEN $stamp, $h, "\n";
    $n++;
  }
}
close $access_fh;

# the set is not needed any more

%seen = ();

# If this run recorded new referrers, fetch each of them and report the
# ones whose page really mentions 'edgarscan'.

if($n){

  # State shared by has_edgarscan() and callback() for the page that is
  # currently being fetched.

  my $has_it;   # true once 'edgarscan' has been found in the page
  my $tail;     # the last characters of the data seen so far

  # loaded only when needed

  require LWP::UserAgent;

  # Content handler passed to the user agent, which calls it with each
  # chunk of the response body. Once the string is found there is no
  # point in reading on, so the handler dies, which makes the user agent
  # abandon the rest of the transfer.
  #
  # Each chunk is examined together with the last 8 characters of what
  # came before it, so that an 'edgarscan' (9 characters) split across
  # two chunks is still found.

  sub callback($$$) {
    my($data) = @_;
    my $window = $tail . $data;
    if($window =~ /edgarscan/i){
      $has_it = 1;
      die 'has it';
    }
    $tail = length($window) > 8 ? substr($window, -8) : $window;
  }

  # has_edgarscan(URL) fetches URL and returns 1 if its content contains
  # 'edgarscan' (in any case), 0 otherwise. A page that cannot be
  # fetched counts as not containing it. A trailing newline on URL is
  # ignored.

  sub has_edgarscan($){
    my $url = shift;
    chomp $url;
    $has_it = 0;
    $tail = '';
    my $request = HTTP::Request->new(GET => $url);

    # Pass request to the user agent, which feeds the body to callback()
    # in chunks of about 4096 bytes. request() always returns a response
    # object, even for a failed fetch, so its result needs no check.
    my $user_agent = LWP::UserAgent->new;
    $user_agent->request($request, \&callback, 4096);
    return $has_it;
  }

  # Read back the lines this run appended to SEEN, which start at
  # offset $t.

  seek SEEN, $t, 0;
  my @has_ed = ();
  while(my $line = <SEEN>){

    # skip the 11-character 'YYYY/MM/DD ' stamp in front of the URL; the
    # newline at the end is kept for printing

    my $h = substr($line, 11);
    has_edgarscan($h) && push @has_ed, $h;
  }
  if(@has_ed){
    print scalar(@has_ed), ' new referal';
    print 's' if @has_ed > 1;
    print ":\n\n";
    print @has_ed;
    print '
for more on who is pointing to us, please visit

  http://sequoia.tc.pw.com/EdgarScan/es/internal_doc/referrers/

and

  http://edgarscan.pwcglobal.com/usage/
';
  }
}

# SEEN was written to, and write errors can show up as late as close

close SEEN or warn "error closing $sn: $!\n";

# now archive the logs
#
# @rn holds (source, target) pairs: each log that was set aside is moved
# to its place in the archive and handed over to 'edgarscan'. Ownership
# and mode are set on a best-effort basis.

while(@rn){
  my ($source, $target) = splice @rn, 0, 2;
  unless(rename $source, $target){
    warn "can't rename $source to $target: $!\n";
    next;
  }
  chown $e_uid, $e_gid, $target;
  chmod 0664, $target;
}

# Finish by turning into bzip2, which compresses the archived logs.
# $bz2 is a space-separated list of names; passing them as a list runs
# bzip2 directly, without a shell. exec only returns if it failed.

exec('bzip2', split(' ', $bz2)) or die "can't exec bzip2: $!";
