#!/var/opt/STORtools/bin/perl
# @(#) @(#)mess_alert.pl    1.3
use Date::Parse;
use Date::Format;

# Uncomment to force debug output without passing -debug.
# $DEBUG = "TRUE";

# Helper routines are loaded at run time from subroutines.pm.
require 'subroutines.pm';
st_globals();

$PROGNAME = 'mess_alert';

# `uname -r` value that selects the release-specific SBUS parsing in
# gather_data.
$RELEASE6 = '5.7';

# Parse the command line first: it sets $HBA, which make_port_list needs.
proc_cli();
make_port_list($HBA);
get_storage();

#######  HOST-SIDE THRESHOLDS  ########
#
# Every pair below is ( warn, alert ): the message counts at which
# scan_data reports a WARNING or an ERROR for a device.

# Loop Offline Thresholds
( $socal_offline_warn, $socal_offline_alert ) = ( 10, 15 );

# Loop CRC Thresholds
( $sf_CRC_warn, $sf_CRC_alert ) = ( 10, 15 );

# SCSI Disk Warning Thresholds
( $ssd_warn, $ssd_alert ) = ( 15, 20 );


#######  T300 WARNING THRESHOLDS  ########
#
# NOTE(review): none of the $pNN_* values are read anywhere in this file;
# confirm whether subroutines.pm consumes them before removing any.

# Reset occurred
( $p01_warn, $p01_alert ) = ( 1, 5 );

# Reset other than hard reset occurred
( $p02_warn, $p02_alert ) = ( 1, 5 );

# Disk error during reconstruct
( $p03_warn, $p03_alert ) = ( 1, 5 );

# Initialization of RAID parity failed for this volume
( $p04_warn, $p04_alert ) = ( 1, 5 );

# Reconstruct operation failed for this volume
( $p05_warn, $p05_alert ) = ( 1, 5 );

# A hard error on this disk prompted a disable operation on it
( $p06_warn, $p06_alert ) = ( 1, 5 );

# Attempt to write this disk failed
( $p07_warn, $p07_alert ) = ( 1, 5 );

# Attempt to open a newly plugged disk failed
( $p08_warn, $p08_alert ) = ( 1, 5 );

# Attempt to create system area on this disk failed
( $p09_warn, $p09_alert ) = ( 1, 5 );

# The system area of this disk is bad
( $p10_warn, $p10_alert ) = ( 1, 5 );

# Attempt to bring newly plugged disk online failed
( $p11_warn, $p11_alert ) = ( 1, 5 );

# Attempt to open this disk failed
( $p12_warn, $p12_alert ) = ( 1, 5 );

# Attempt to verify the data in the system area failed
( $p13_warn, $p13_alert ) = ( 1, 5 );

# SCSI Disk Error Occurred
( $p14_warn, $p14_alert ) = ( 1, 5 );

# The Alternate Master detected that the Master controller failed to
# send out heartbeats
( $p15_warn, $p15_alert ) = ( 1, 5 );

# System has bypassed a disk drive on loop A
( $p16_warn, $p16_alert ) = ( 1, 5 );

# System has bypassed a disk drive on loop B
( $p17_warn, $p17_alert ) = ( 1, 5 );

# Fan 1 fault on power supply unit
( $p18_warn, $p18_alert ) = ( 1, 5 );

# Fan 2 fault on power supply unit
( $p19_warn, $p19_alert ) = ( 1, 5 );

# DC of a power supply unit not ok
( $p20_warn, $p20_alert ) = ( 1, 5 );

# A power supply unit has been disabled
( $p21_warn, $p21_alert ) = ( 1, 5 );

# A power supply unit is off
( $p22_warn, $p22_alert ) = ( 1, 5 );

# System detects that a power supply unit has switched to battery
( $p23_warn, $p23_alert ) = ( 1, 5 );

# System detects missing battery of a power supply unit
( $p24_warn, $p24_alert ) = ( 1, 5 );

# System detects that a power supply unit has been switched off
( $p25_warn, $p25_alert ) = ( 1, 5 );

# System is not able to poll the loop card for information
( $p26_warn, $p26_alert ) = ( 1, 5 );

# System detects the loop card is disabled
( $p27_warn, $p27_alert ) = ( 1, 5 );

# System disabled a controller
( $p28_warn, $p28_alert ) = ( 1, 5 );

# Battery took too long to recharge
( $p29_warn, $p29_alert ) = ( 1, 5 );

# Disk Reconstruct failed
( $p30_warn, $p30_alert ) = ( 1, 5 );

# Disk Disable failed
( $p31_warn, $p31_alert ) = ( 1, 5 );

# Disk hot plug in installation failed
( $p32_warn, $p32_alert ) = ( 1, 5 );

# System detects multiple bit error in controller's memory
( $p33_warn, $p33_alert ) = ( 1, 5 );

# System detects multiple disk failure in a lun
( $p34_warn, $p34_alert ) = ( 1, 5 );

# System detects a previously configured disk inserted into the wrong
# position in a unit
( $p35_warn, $p35_alert ) = ( 1, 5 );

# System detects when a read failed during recon stripe
( $p36_warn, $p36_alert ) = ( 1, 5 );

# System detects a replace battery condition
( $p37_warn, $p37_alert ) = ( 1, 5 );

# System detects a replace battery condition
# NOTE(review): same description as p37 -- confirm the intended message.
( $p38_warn, $p38_alert ) = ( 1, 5 );

# System detects a replace battery condition
# NOTE(review): same description as p37 -- confirm the intended message.
( $p39_warn, $p39_alert ) = ( 1, 5 );

#####################################################################
# Main flow: scan the message file, evaluate the thresholds and, when
# anything was flagged, mail a report to $EMAIL_ADDRESSES.
#####################################################################

# The path may come straight from the -f option, so open it with an
# explicit read mode: a name starting with '>' or ending in '|' must
# never be treated as a mode or a command.
open( FILE, "<", $MESSAGE_FILE )
    or die "problem opening $MESSAGE_FILE: $!";

$scan_interval = ( $minutes + ( $hours * 60 ) ) * 60; # in seconds
$start_date = time - $scan_interval;

# Take month and year from a single localtime() snapshot so the two can
# never straddle a year rollover.
( $current_month, $current_year ) = ( localtime )[ 4, 5 ];
$current_year += 1900;

chomp( $os_version = `/usr/bin/uname -r` );

gather_data( \*FILE );
close FILE;

scan_data();

if ( defined $alert || defined $warn ) {
    $subject = ($alert) ? "${PRODUCT}:mess_alert:${STORAGE_DEVICE} ERROR message" : "${PRODUCT}:mess_alert:${STORAGE_DEVICE} WARNING message";

    # The report is staged in a temporary file that mail_message() sends.
    # NOTE(review): /tmp/messalert<pid> is a predictable name; consider
    # File::Temp if this runs as root on a multi-user host.
    $TMP_MAIL = "/tmp/messalert$$";
    open( MAIL, ">", $TMP_MAIL )
        or die "problem opening $TMP_MAIL for writing: $!";

    @message = st_header("log");
    print MAIL @message;

    if ( ( $t300_warn_cnt > 0 ) || ( $t300_error_cnt > 0 ) ) {
        print MAIL "\nT300 devices reporting problems:\n";
        foreach $unique_t300 (@unique_hostnames) {
            print MAIL "$unique_t300\n";
        }
        print MAIL "\n";
    }

    # Errors first, then warnings.
    if ($alert) {
        print MAIL @alerts;
    }
    if ($warn) {
        print MAIL @warns;
    }

    # Buffered write errors only surface at close time.
    close(MAIL)
        or warn "problem writing $TMP_MAIL: $!\n";

    if ( !mail_message( $EMAIL_ADDRESSES, $subject, $TMP_MAIL ) ) {
        # print, not printf: the address list must not be used as a format.
        print "Warning:  Cannot complete mailing of errors to $EMAIL_ADDRESSES.\n";
    }

    # Remove the staging file without spawning a shell.
    unlink($TMP_MAIL)
        or warn "problem removing $TMP_MAIL: $!\n";

    if ($DEBUG) {
        print @alerts, @warns;
    }
}
exit 0;
#####################################################################
# gather_data - parse the file looking for warning and error messages
#
# Argument:
#   $fh - readable handle on the message file; it is read to EOF.
#
# Each line whose timestamp is not older than $start_date is classified
# according to $HBA / $STORAGE and tallied into package-level counters:
#   SBUS ($HBA "S") : %SFOFFL %WARNSSD %SFCRCWARN %SFOFFTOWARN %SFDMAWARN
#                     %SFRESET %RETRYELS %RETRYSF %TOELS %SFTOELS %DDOFFL
#   PCI  ($HBA "P") : %WARNSSD %SFOFFL %SFCRCWARN %DDCRCWARN %SFDMAWARN
#                     %DDOFFL
#   T3              : $t300_warn / $t300_warn_cnt, $t300_error /
#                     $t300_error_cnt, @unique_hostnames
#
# The file is slurped into @filestuff so that two-line messages can be
# handled by peeking at, or consuming, the following line.  All matching
# is done against the current line taken from that array.
#####################################################################
sub gather_data {
    my $fh = shift;
    my $line_date;

    push @filestuff, <$fh>;

    # defined() so that a line that happens to be false ("0") cannot end
    # the scan early.
    while ( defined( $line = shift @filestuff ) ) {
        ($line_date)  = $line =~ /^(\w+\s+\d+\s+\d+:\d+:\d+)/;
        ($line_month) = $line =~ /^(\w+)\s+\d+\s+\d+:\d+:\d+/;

        # A line without a leading timestamp cannot be dated.
        next unless defined $line_date;

        # The message file does NOT contain a year, so derive one: a month
        # later than the current month must belong to the previous year.
        # Messages two or more years old are of no interest.
        $line_month = get_numeric_month($line_month);
        if ( $line_month <= $current_month ) {
            $line_year = $current_year;
        } else {
            $line_year = $current_year - 1;
        }
        $line_date = str2time("$line_date $line_year");

        # Unparseable or too old.
        next if !defined $line_date || $line_date < $start_date;

        if ( ( $HBA eq "S" ) && ( $STORAGE ne "T3" ) ) {
            _gather_sbus_line($line);
        } elsif ( ( $HBA eq "P" ) && ( $STORAGE ne "T3" ) ) {
            _gather_pci_line($line);
        } elsif ( $STORAGE eq "T3" ) {
            _gather_t3_line($line);
        }
    }
}

# _gather_sbus_line - tally one SBUS (socal/sf) message line; may consume
# the following line from @filestuff when the message spans two lines.
sub _gather_sbus_line {
    my $text     = shift;
    my $release6 = ( $os_version eq $RELEASE6 );

    # Loop Offlines
    if ( $text =~ /(socal\d+:\s+port\s+\d+):.*OFFLINE$/ ) {
        $SFOFFL{$1}++;
        return;
    }

    # SSD Warnings
    if ( $text =~ /WARNING.*\((ssd\d+)\)/ ) {
        $WARNSSD{$1}++;
        return;
    }

    # Loop CRC, DMA and Timeout Warnings: the sf instance is named on the
    # WARNING line and the detail follows on the next line.
    if ( $text =~ /WARNING.*\((sf\d+)\)/ ) {
        my $sf = $1;

        # From here on the remaining tests look at the detail line.
        $text = shift @filestuff;
        $text = "" unless defined $text;

        if ( $text =~ /CRC/ ) {
            $SFCRCWARN{$sf}++;
            return;
        }
        if ( $text =~ /Offline Timeout/ ) {
            $SFOFFTOWARN{$sf}++;
            return;
        }
        if ( $text =~ /INCOMPLETE DMA XFER/ ) {
            $SFDMAWARN{$sf}++;
            return;
        }

        if ($release6) {
            # On $RELEASE6 resets and ELS events are only recognised as the
            # detail of an sf WARNING and are counted against that sf.

            # SF Resets
            if ( $text =~ /(sf\d+):.*sf_reset/ ) {
                $SFRESET{$sf}++;
            }

            # Target and Loop ELS Retry
            if ( $text =~ /(sf\d+):\s+ELS\s+.*(target\s+0x[0-9a-f]+)\s+retrying/ ) {
                $RETRYELS{"$sf $2"}++;
                $RETRYSF{$sf}++;
                return;
            }

            # Target and Loop ELS Time Outs
            if ( $text =~ /(sf\d+):\s+ELS\s+.*(target\s+0x[0-9a-f]+)\s+timed out/ ) {
                $TOELS{"$sf $2"}++;
                $SFTOELS{$2}++;
                return;
            }
        }
    }

    if ( !$release6 ) {
        # SF Resets
        if ( $text =~ /(sf\d+):.*sf_reset/ ) {
            $SFRESET{$1}++;
        }

        # Target and Loop ELS Retry
        if ( $text =~ /(sf\d+):\s+ELS\s+.*(target\s+0x[0-9a-f]+)\s+retrying/ ) {
            $RETRYELS{"$1 $2"}++;
            $RETRYSF{$1}++;
            return;
        }

        # Target and Loop ELS Time Outs
        if ( $text =~ /(sf\d+):\s+ELS\s+.*(target\s+0x[0-9a-f]+)\s+timed out/ ) {
            $TOELS{"$1 $2"}++;
            $SFTOELS{$2}++;
            return;
        }
    }

    # Target Offlines
    if ( $text =~ /(sf\d+):\s+(target 0x[0-9a-f]+)\s+offline/ ) {
        $DDOFFL{"$1 $2"}++;
    }
    return;
}

# _gather_pci_line - tally one PCI (ifp) message line; may consume the
# following line from @filestuff when it carries the detail of this one.
sub _gather_pci_line {
    my $text = shift;

    # Peek at the next line without removing it.
    my $peek = defined $filestuff[0] ? $filestuff[0] : "";

    # "occured" is spelled the way the pattern has always matched it.
    return
        unless $text =~ /WARNING/
        || $text =~ /LIP reset occured/
        || $peek =~ /LIP reset occured/;

    if ( $text =~ /\((ssd\d+)\)/ ) {
        $WARNSSD{$1}++;
    } elsif ( $text =~ /ifp/ ) {
        # Every ifp counter is keyed on the instance named on this line.
        my ($ifp)    = $text =~ /(ifp\d+)/;
        my $nextline = shift @filestuff;
        my $detail   = defined $nextline ? $nextline : "";
        my $consumed = 1;

        if ( $text =~ /LIP reset occured/ ) {
            $SFOFFL{$ifp}++;
            $consumed = 0;
        } elsif ( $detail =~ /LIP reset occured/ ) {
            $SFOFFL{$ifp}++;
        } elsif ( $detail =~ /CRC/ ) {
            # count it against the ifp
            $SFCRCWARN{$ifp}++;
            my ($tmpt) = $text =~ /target=(0x[0-9a-f]+):/;
            # count it against the ifp targ
            $DDCRCWARN{"${ifp} target ${tmpt}"}++;
        } elsif ( $detail =~ /DMA Failure/ ) {
            $SFDMAWARN{$ifp}++;
        } else {
            $consumed = 0;
        }

        # Hand the look-ahead line back when it was not part of this
        # message; never push undef back at end of input.
        unshift( @filestuff, $nextline )
            if !$consumed && defined $nextline;
    } elsif ( $text =~ /offline/ ) {
        my ($ifp)    = $text =~ /(ifp\d+)/;
        my ($target) = $text =~ /(target 0x[0-9a-f]+)/;
        $DDOFFL{"$ifp $target"}++;
    }
    return;
}

# _gather_t3_line - collect one T300 " W: " (warning) or " E: " (error)
# line and remember which array reported it.
sub _gather_t3_line {
    my $text = shift;

    $hostname = match_hostname($text);

    my $is_warning = ( $text =~ / W: / ) ? 1 : 0;
    my $is_error   = ( !$is_warning && $text =~ / E: / ) ? 1 : 0;
    return unless $is_warning || $is_error;

    # Exact comparison: a hostname is data, not a pattern.
    if ( !grep { $_ eq $hostname } @unique_hostnames ) {
        push @unique_hostnames, $hostname;
    }

    if ($is_warning) {
        $t300_warn .= $text;
        $t300_warn_cnt++;
    } else {
        $t300_error .= $text;
        $t300_error_cnt++;
    }
    return;
}

#######################################################################
# scan_data - traverse each hash testing if warning or alert limits
#             have been exceeded.
#
# Reads the counters filled in by gather_data (%SFCRCWARN, %SFOFFL,
# %WARNSSD, %SFDMAWARN, $t300_warn*, $t300_error*) and the *_warn /
# *_alert thresholds.  For every device:
#   count >= alert threshold           -> message appended to @alerts
#   warn threshold <= count < alert    -> message appended to @warns
# $alert / $warn are left defined when anything was reported.  Any DMA
# warning at all is reported as an error.
#
# Messages are built by plain interpolation, never through sprintf, so a
# '%' inside a device name or a raw T300 log line is reported verbatim.
#######################################################################
sub scan_data {
    my $count;

    # Loop CRC warnings.
    foreach my $key ( sort keys %SFCRCWARN ) {
        $count = $SFCRCWARN{$key};
        if ( $count >= $sf_CRC_alert ) {
            # >= so that a count exactly on the alert threshold is not lost
            # between the warning and the alert range.
            $alert++;
            push @alerts, "ERROR: $key has received $count sf CRC WARNING messages\n";
        } elsif ( $count >= $sf_CRC_warn ) {
            $warn++;
            push @warns, "WARNING: $key has received $count sf CRC WARNING messages\n";
        }
    }

    # Loop offlines; the wording depends on the adapter type.
    foreach my $key ( sort keys %SFOFFL ) {
        $count = $SFOFFL{$key};
        if ( $count >= $socal_offline_alert ) {
            $alert++;
            if ( $HBA eq "S" ) {
                push @alerts, "ERROR: $key has received $count socal OFFLINE messages\n";
            } elsif ( $HBA eq "P" ) {
                push @alerts, "ERROR: $key has received $count ifp OFFLINE messages\n";
            }
        } elsif ( $count >= $socal_offline_warn ) {
            $warn++;
            if ( $HBA eq "S" ) {
                push @warns, "WARNING: $key has received $count socal OFFLINE messages\n";
            } elsif ( $HBA eq "P" ) {
                push @warns, "WARNING: $key has received $count ifp OFFLINE messages\n";
            }
        }
    }

    # SCSI disk warnings.
    foreach my $key ( sort keys %WARNSSD ) {
        $count = $WARNSSD{$key};
        if ( $count >= $ssd_alert ) {
            $alert++;
            push @alerts, "ERROR: $key has received $count WARNING messages\n";
        } elsif ( $count >= $ssd_warn ) {
            $warn++;
            push @warns, "WARNING: $key has received $count WARNING messages\n";
        }
    }

    # DMA warnings have no threshold: one is enough for an error.
    $dma_errors = 0;
    foreach my $key ( sort keys %SFDMAWARN ) {
        if ( $SFDMAWARN{$key} ) {
            $alert++;
            $dma_errors++;
            push @alerts, "ERROR: $key has received $SFDMAWARN{$key} sf DMA WARNING messages\n";
        }
    }
    if ( $dma_errors > 0 ) {
        push @alerts, "FATAL DMA ERRORS have been detected!\nIf this HBA has not already been replaced, it must be replaced immediately!\n";
    }

    #
    # Support for T300 "Warning" messages
    #
    if ( $t300_warn_cnt > 0 ) {
        $warn = 1;
        push @warns, "\nT300 Warning messages:\n";
        push @warns, "$t300_warn";
    }

    #
    # Support for T300 Error messages
    #
    if ( $t300_error_cnt > 0 ) {
        $alert = 1;
        push @alerts, "\nT300 Error messages:\n";
        push @alerts, "$t300_error";
    }
}

#######################################################################
# proc_cli - parse @ARGV into the package-level run-time settings.
#
# Sets: $HBA ("S"/"P"), $STORAGE ("T3" or ""), $EMAIL_ADDRESSES (default
# "root"), $MESSAGE_FILE (default /var/adm/messages), $minutes, $hours,
# $DEBUG and @usage.  When neither -m nor -h is given the scan window
# defaults to 24 hours.
#
# Dies with the usage text when there are no arguments, on an unknown
# option, or when an option value is missing or malformed; exits 1 when
# the -f file does not exist or is empty.
#######################################################################
sub proc_cli {

    $HBA = "";
    $STORAGE = "";
    undef $minutes;
    undef $hours;

    @usage =
    (
    "USAGE: $PROGNAME -S | -P | -T3 | -mail email address(es) | -f <file> | -m minutes | -h hours\n",
    "S    - Select SBUS Host Adapters (default)\n",
    "P    - Select PCI Host Adapters\n",
    "T3   - T300 Series Storage array\n",
    "mail - mail messages to email address(es)\n",
    "f    - select a message file\n",
    "m    - minutes between message scan (default 24 hours)\n",
    "h    - hours between message scan (default 24 hours)\n",
    );

    if ($#ARGV<0) {
        die "\n$PROGNAME requires arguments.\n @usage\n";
    }

    $EMAIL_ADDRESSES = "root";
    $MESSAGE_FILE = "/var/adm/messages";

    # Signed decimal number, as accepted for -m and -h.
    my $numeric = '^[-+]?\d+(?:\.\d+)?$';

    # defined() rather than truth: an argument such as "0" must not make
    # the remaining options vanish silently.
    while ( defined( $arg = shift @ARGV ) ) {
        # Tolerate empty arguments (e.g. an empty quoted shell variable).
        next if $arg eq "";

        # The order of the tests matters: -mail must be seen before -m,
        # and anything containing "help" before -h.
        if ($arg =~ /help/) {
            die @usage;
        } elsif ($arg =~ /^-debug/) {
            $DEBUG = "TRUE";
        } elsif ($arg =~ /^-S/) {
            $HBA = "S";
            check_bus_option( $HBA );
        } elsif ($arg =~ /^-P/) {
            $HBA = "P";
            check_bus_option( $HBA );
        } elsif ($arg =~ /^-T3/i) {
            $STORAGE = "T3";
        } elsif ( $arg =~ /^-mail/ ) {
            my $addresses = shift @ARGV;
            if ( !defined $addresses || $addresses eq "" ) {
                die "\n-mail requires an email address!\n @usage\n";
            }
            $EMAIL_ADDRESSES = $addresses;
        } elsif ( $arg =~ /^-f/i ) {
            $message_file = shift @ARGV;
            $message_file = "" unless defined $message_file;
            # -s is true only for an existing entry of non-zero size.
            if ( $message_file ne "" && -s $message_file ) {
                $MESSAGE_FILE = $message_file;
            } else {
                print "$message_file does not exist or has zero length\n";
                exit 1;
            }
        } elsif ( $arg =~ /^-m/ ) {
            $minutes = shift @ARGV;
            # A missing or non-numeric value would otherwise be treated as
            # 0 and silently shrink the scan window to nothing.
            if ( !defined $minutes || $minutes !~ /$numeric/ ) {
                $minutes = "" unless defined $minutes;
                die "\n-m requires a numeric value (got '$minutes')!\n @usage\n";
            }
            if ( ( $minutes < 0 ) || ( $minutes > 60 ) ) {
               die "\n$minutes minutes must be between 0 and 60!\n @usage\n";
            }
        } elsif ( $arg =~ /^-h/ ) {
            $hours = shift @ARGV;
            if ( !defined $hours || $hours !~ /$numeric/ ) {
                $hours = "" unless defined $hours;
                die "\n-h requires a numeric value (got '$hours')!\n @usage\n";
            }
            if ( ( $hours < 0 ) || ( $hours > 24 ) ) {
               die "\n$hours hours must be between 0 and 24!\n @usage\n";
            }
        } else {
            die "\n$arg is an invalid option!\n @usage\n";
        }
    }

    # No adapter type on the command line: fall back to the two helpers.
    # NOTE(review): presumably they detect the HBAs and set $HBA -- confirm
    # in subroutines.pm.
    if ( $HBA ne "P" && $HBA ne "S" ) {
        get_fcal_hbas();
        select_fcal_hbas();
    }

    print "HBA: $HBA\n" if ($DEBUG);

    # Default scan window is 24 hours; whichever of the two units was not
    # supplied counts as zero.
    if ( ( !defined $hours ) && ( !defined $minutes ) ) {
        $hours = 24;
    }

    if ( !defined $minutes ) {
        $minutes = 0;
    }
    if ( !defined $hours ) {
        $hours = 0;
    }
}

#######################################################################
# get_numeric_month - map a three-letter English month abbreviation to
#                     the 0-based month number used by localtime().
#
# Argument: $month - "Jan" .. "Dec" (case sensitive); may be undef.
# Returns:  0 .. 11, or undef (empty list in list context) when the
#           name is not recognised.  In a numeric comparison undef
#           counts as 0.
#######################################################################
sub get_numeric_month {
    my $month = shift;

    # Built inside the sub on purpose: a file-level initialisation placed
    # after the main script's "exit 0" would never be executed.
    my %index_of = (
        Jan => 0, Feb => 1, Mar => 2,  Apr => 3,
        May => 4, Jun => 5, Jul => 6,  Aug => 7,
        Sep => 8, Oct => 9, Nov => 10, Dec => 11,
    );

    # Unknown or missing name: return explicitly instead of falling off
    # the end of the sub.
    return unless defined $month && exists $index_of{$month};

    return $index_of{$month};
}
