#!/usr/local/bin/perl -w
#
# $Id: big,v 1.4 2000/02/23 23:53:19 howland Exp $
#
# this script does two things.
#
#   1  It consolidates all the logs on sundew into one big log.
#
#   2  It restarts apache when it recognizes a scary error message.
#
# since it is running all the time, the nightly_webserver script sends
# it a HUP each night to archive its logs just like the web server.
#
# It must be run as root. Be root, then issue the commands
#
#	cd /opt/apache/logs
#	kill $(<big.pid) && nohup /export/home/edgarscan/bin/big >& /dev/null &
#
########################################################################
#
# Copyright (c) 2000 Tom Howland
#
# You may distribute copies or derivations of this file under the terms
# of either the GNU General Public License or the Artistic License, as
# specified in the Perl README file.
#
# This code is provided with no warranty of any kind,and is used
# entirely at your own risk. This code was written by the author as a
# private individual, and is in no way endorsed or warrantied.
#
# Support questions and suggestions can be directed to tom@rahul.net
# Download from http://www.rahul.net/tom/WebMonitor.html
#
########################################################################

# $scary is the regular expression that causes a restart of Apache. It
# is matched against every line that survives the filter in maybe_print.
# The leading `e: ' is presumably the tag tailn puts in front of lines
# from error_log (see $tailn below) -- confirm against tailn.

$scary = '^e: .*(Too many open files: couldn\'t spawn child process|java.lang.OutOfMemoryError|Unable to make connection to database|java.sql.SQLException: Broken pipe|Unable to execute query pwtc.edgar|java.sql.SQLException: ORA-01000: maximum open cursors exceeded)';

# Everything below uses relative file names (the logs handed to tailn,
# the pid file, the consolidated log), so failing to get here must be
# fatal rather than silently working in the wrong directory.

chdir('/opt/apache/logs') or die "couldn't chdir to /opt/apache/logs: $!\n";

# $tailn is what generates the log stream we examine.

$tailn = '/export/home/edgarscan/bin/tailn a access_log e error_log j jserv.log m mod_jserv.log';

$out = 'big';  # $out.log is the name of the log file, $out.pid is the
               # name of the process id file.

$interval = 9; # seconds we sleep at the start, after a restart, and
               # after a hup. By `sleep' I mean we ignore the output
               # from Apache. By `ignore' I mean we do not scan it for
               # restart triggers. We do this to avoid loops. There are
               # no omissions from $out.log.

$history = 99; # the number of lines of history we maintain for the mail
               # message.

########################################################################

# make a record of our process identification, so that we can be
# signalled through $out.pid (see the kill command in the header).
open(OUT, '>' . $out . '.pid') or die "couldn't write $out.pid: $!\n";
print OUT $$, "\n";
close(OUT) or die "couldn't write $out.pid: $!\n";

# launch our tailn command. The check uses `or', not `||': `||' binds
# tighter than the comma, so it would test the (always true) string
# being built instead of the result of open, and the die could never
# fire.
open(I, $tailn . ' | ') or die "couldn't tailn: $!";

# Start (or restart) the quiet period. Afterwards $countdown holds the
# epoch time, $interval seconds from now, until which the callers only
# copy output to the log and do not scan it for restart triggers.

sub launch_countdown(){
  my $now = time;
  $countdown = $now + $interval;
}

# $cursor is the slot of @told written most recently; -1 means that no
# line has been recorded yet.

$cursor = -1;

# Read the next line of the tailn stream into the global $_ and, unless
# it is filtered out, append it to the currently selected output handle.
#
# Lines matching the filter below (.gif requests tagged `a:' and the
# JServ clean-up notice) are dropped. For every line that is kept, the
# position of OUT before the write is stored in the ring buffer @told
# ($history slots), so that the main loop can later seek back to it.
#
# Returns 1 if the line was printed, 0 if it was filtered out. Dies if
# the tailn stream has ended.

sub maybe_print(){
  $_ = <I>;
  die "the following command isn't working\n\n$tailn\n\n" unless defined($_);
  return 0 if /^a:.*\.gif|Apache JServ Module was cleaned-up/;
  my $slot = ($cursor + 1) % $history;
  $told[$slot] = tell(OUT);
  $cursor = $slot;
  print $_;
  return 1;
}

$log = $out . '.log';

# (Re)open the consolidated log. Called once at startup and installed as
# the HUP handler.
#
#   - opens $log for appending (and reading, for the history dump in the
#     restart mail), creating it if necessary;
#   - points STDERR at the same file and makes OUT the selected handle,
#     both unbuffered, so that the bare print in maybe_print lands in
#     the log;
#   - forgets the line history, see below;
#   - writes an explanatory header if the file is empty;
#   - starts a quiet period and copies output, without scanning it for
#     restart triggers, until that period is over.
#
# Dies if the log cannot be opened.

sub hup(){
  open(OUT, '+>>' . $log) or die "couldn't open $log: $!\n";
  open(STDERR, '>&OUT') or die "couldn't redirect STDERR to $log: $!\n";
  select STDERR; $| = 1;
  select OUT; $| = 1;
  seek OUT, 0, 2;

  # @told holds byte offsets into the file we had open before. If we
  # were sent a HUP it is presumably because that file was moved away,
  # and the offsets mean nothing in the new one, so start over.
  @told = ();
  $cursor = -1;

  if(0 == tell(OUT)){
    print '? This log file is a filtered version of the result of the
? command
?
? ' . $tailn . '
?
? and is generated by the script ~edgarscan/bin/big, which uses it for
? monitoring and proactively restarting Apache.
?
';
  }
  launch_countdown;
  maybe_print while time < $countdown;
}

# Open the log for the first time. This also sits out the initial quiet
# period.
hup;

# From now on a HUP makes us reopen the log.
$SIG{HUP} = \&hup;

# print_please($i, $l): add log line $l to the mail message, prefixed
# with its one-based number ($i is zero-based) padded with zeros to at
# least three digits.

sub print_please($$){
  my ($i, $l) = @_;
  printf MAIL '%.3d %s', $i + 1, $l;
}

# here comes the main loop

while(1){
  maybe_print || next;

  # see if we need to restart it.

  if(/$scary/){

    # keep our own copy of the trigger: $_ is global, and a HUP arriving
    # while we compose the mail runs maybe_print, which overwrites it.
    my $scary_line = $_;

    # STDERR is the log, so a failed restart is at least recorded there.
    system('/opt/apache/bin/apachectl graceful') == 0
      or warn "big: apachectl graceful failed, status $?\n";

    launch_countdown;

    # a mailer that goes away early must not kill us with SIGPIPE. This
    # is set after the restart so that apachectl does not inherit it.
    local $SIG{PIPE} = 'IGNORE';

    unless(open(MAIL, "|mailx -s 'Apache on sundew restarted' edgarscan_bug")){
      warn "big: couldn't start mailx: $!\n";
      # keep going without mail; we still have to sit out the quiet
      # period below, and the log has the whole story anyway.
      open(MAIL, '>/dev/null');
    }

    my $tp1 = $#told + 1;

    print MAIL 'The scary line

', $scary_line, '

was encountered ', $tp1, ' lines into the following log

------------------------------------------------------------------------

';

    # dump the last n lines. @told is a ring buffer: until it is full
    # the oldest line is in slot 0, afterwards it is in the slot after
    # $cursor.

    my $top = 0;
    $top = ($cursor + 1) % $history if $history == $tp1;
    seek OUT, $told[$top], 0;
    my $i;

    for($i = 0; $i < $tp1; $i++){
      my $l = <OUT>;
      last unless defined $l;
      print_please($i, $l);
    }

    # we have been reading OUT and are about to write to it again; a
    # seek is required in between, and the end is where we belong.
    seek OUT, 0, 2;

    # ignore all output until it has restarted.

    while(time < $countdown){
      maybe_print && print_please($i++, $_);
    }

    print MAIL '... see ', $log, ' for more.

------------------------------------------------------------------------

please check EdgarScan at your earliest convenience. Visit
http://edgarscan.pwcglobal.com/ and visit a few sub pages. Possible
remedial actions include

  restarting tunneling -- this will be necessary if there have been any
  network resets.

    on sequoia, be root

    cd /etc/init.d
    ./ssh_tunnel stop
    ./ssh_tunnel start

or

  restarting the edgarservers

    on sequoia, be root

    exec /home/howland/alien
    stop_e
    start_e

  restarting the web server. The script that is sending you this
  message, sundew:/export/home/edgarscan/bin/big, has already attempted
  a graceful restart. Maybe it is time for a graceless restart.

    on sundew, be root

    cd /
    . ./tom
    apachectl stop
    sleep ', $interval, '
    apachectl start

For more tips try visiting the troubleshooting guide at

http://sequoia.tc.pw.com/internal_doc/TroubleShooting.html

';

    close(MAIL) or warn "big: mailx failed, status $?\n";
  }
}
