|
#!/usr/local/bin/perl
|
|
#
|
|
# Minimal monitor_disk_space
|
|
#
|
|
# Copyright 1992 - 1997 Stephen O. Lidie. All rights reserved.
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify\
|
|
it under
|
|
# the same terms as Perl itself.
|
|
#
|
|
# A TCP client/server to monitor disk space and alert operators when a\
|
|
machine
|
|
# is running low. Each monitored machine runs the server
|
|
# `monitor_disk_space_daemon' which transmits a `df' output when reque\
|
|
sted by
|
|
# this client. Results are summarized and displayed as a colored bar \
|
|
graph.
|
|
#
|
|
# Typically `monitor_disk_space' (monds) is used in a central computin\
|
|
g site
|
|
# where a small number of critical servers must remain operational.
|
|
#
|
|
# monds maintains a memory-resident database containing historical dis\
|
|
k space
|
|
# usage information as an associative array indexed by hostname, which\
|
|
is used
|
|
# to see how filesystem usage varies in time.
|
|
#
|
|
# Use monitor_disk_space -full_help for full help.
|
|
|
|
require 5.002; # need Perl at least at this lev\
|
|
el
|
|
use English; # legible variable names
|
|
use strict; # be a pure as possible
|
|
|
|
# A BEGIN constructor is executed as soon as it's defined, so we
|
|
# prepend the directory name that contains our own class modules and
|
|
# `require' files to Perl's @INC variable so they can be located.
|
|
#
|
|
# Multiple BEGINs are executed in order, so once Perl knows where to f\
|
|
ind
|
|
# our private code we can:
|
|
#
|
|
# - Do command line parsing first so we don't have to load all of
|
|
# X/Tk just for a -help request.
|
|
# - Create the Perl/Tk MainWindow immediately and do Tk stuff. Since\
|
|
monds
|
|
# takes some time to load, a status display consisting of two Label\
|
|
|
|
# widgets keeps us occupied by displaying how initialization is
|
|
# progressing. The first label is an unchanging string while the s\
|
|
econd
|
|
# dynamically displays our state.
|
|
sub update_status;                      # forward declaration; defined near the end

# Lexicals assigned by the BEGIN blocks that follow, i.e. while the rest of
# this file is still being compiled.  They are declared without initializers
# on purpose.

my ($LIBDIR,                            # pathname of monds auxiliary files directory
    $MW,                                # Tk Main Window
    $FS,                                # master grid frame for filesystems
    $STATUS,                            # label widget showing the current start-up step
    $STATUS_L,                          # label widget with the fixed 'Initializing' text
    $VERSION);                          # monds version number

use vars qw(@PDT @MM %OPT);             # command line information
|
|
|
|
BEGIN {

    # Executed as soon as it has been parsed: record where our private
    # modules and `require' files live and put that directory at the front
    # of @INC so the `use'/`require' statements further down can find them.

    unshift @INC, ($LIBDIR = "./lib");

}
|
|
BEGIN {

    # Command line parsing comes first, in a BEGIN block of its own.
    #
    # A `use' statement is executed the moment it is parsed, even when it
    # is written inside a BEGIN block.  With `use Tk' in the same block as
    # the EvaP call, Tk was therefore always loaded before the command line
    # was looked at.  Keeping `use Tk' in the next BEGIN block means this
    # block runs to completion before Tk is even parsed, so a -help request
    # can be answered without loading X/Tk (presumably EvaP exits after
    # printing help - confirm against Getopt::EvaP).

    use Getopt::EvaP;                   # Evaluate Parameters
    require "inied.pl";                 # initialize Evaluate Parameters data
    EvaP \@PDT, \@MM, \%OPT;            # evaluate command line parameters
    $Getopt::EvaP::evap_embed = 1;      # make Evaluate Parameters embeddable now

}

BEGIN {

    # Create the Perl/Tk MainWindow immediately and show a two-label status
    # display while the remainder of monds is loaded: $STATUS_L carries the
    # unchanging text 'Initializing', $STATUS is rewritten by update_status.

    use Tk;                             # define Tk objects and methods

    $VERSION = '0.1';

    $MW = MainWindow->new;
    $MW->positionfrom('user');
    $MW->geometry('+100+100');
    $MW->title("Monitor Disk Space $VERSION");
    $MW->iconname('monds');
    $MW->iconbitmap("\@$LIBDIR/monds.xbm");

    $STATUS_L = $MW->Label(-text => 'Initializing', -fg => 'blue');
    $STATUS_L->grid(-row => 0, -column => 0, -sticky => 'w');

    $STATUS = $MW->Label(qw(-width 40 -anchor w -foreground blue),
                         -text => "monds $VERSION ...");
    $STATUS->grid(-row => 0, -column => 1, -sticky => 'w');

    $MW->update;                        # make the status display visible now

}
|
|
|
|
# Load the remaining modules and `require' files.  Note that every `use'
# is carried out while the file is compiled, i.e. before any of the
# update_status calls below run; only the `require's happen at run time, in
# the order written.

update_status('Class Modules');
use Carp;                               # better traceback
use Tk::Dialog;                         # Dialog objects
use Tk::ErrorDialog;                    # errordialog object
use FileHandle;                         # handle methods
use Filesystem;                         # Filesystem objects

update_status('Require Files');
require "ctime.pl";                     # time conversion routines
use Socket;                             # socket.h defines
require "stat.pl";                      # file status ordinals
$::ST_MTIME = $::ST_MTIME;              # second mention suppresses a -w message

update_status('Forward Declarations');
use subs qw(
    abort
    analyze_df_data
    check_configuration_files
    construct_main_widgets
    display_poll_results
    end_monds
    flash_widget
    heuristics_say_so
    initialize
    op_msg
    output
    poll_clients
    read_efcf
    start_tcp_clients
    stop_tcp_clients
    write_log_file
);
|
|
|
|
# File-scope state shared by the subroutines below.  The data structures
# are described in detail after __END__.

my (%CHILD,                             # asynchronous process information, per host
    @COLORS,                            # bar colors for the different severity levels
    $DDA,                               # monds About dialog
    %EXCLUDE,                           # lists of filesystems not to monitor, per host
    @FS_ATTENTION_DATA,                 # output lines carrying alert information
    %HOSTS,                             # lists of `df' samples for monitored hosts
    $LAST_EF_MTIME,                     # last modification time of the exclude file
    $MAX_DF_SAMPLES,                    # keep this many `df' samples per host
    $OLD_TIME,                          # time of moon's last phase change
    @PATTERNS,                          # list of moon phase bitmaps
    $PATTERN_MODULUS,                   # phase change counter
    $PORT,                              # TCP port used by monds client/server tasks
    $READ_BITS,                         # select() socket read bitmap mask
    %SEVERITY,                          # hash of filesystem severity level cutoffs
    $THROB);                            # label widget with throbbing moon
|
|
|
|
# Main.
|
|
|
|
initialize();                                       # preset monitor_disk_space

# Have Tk call poll_clients every $OPT{poll_interval}, then hand control to
# the Perl/Tk event loop.
$MW->repeat($OPT{poll_interval}, \&poll_clients);   # asynchronous TCP/IP polling
MainLoop();                                         # process Perl/Tk X events
|
|
|
|
sub abort {

    # Replacement for `die': warn about the failed operation and then shut
    # down through end_monds, so the child processes are signalled and the
    # Tk main window is destroyed in an orderly way.
    #
    # $what - short description of the operation that failed; the current
    #         $OS_ERROR text is appended to the warning.

    my $what = $ARG[0];

    carp "monds: $what failed: $OS_ERROR";
    end_monds();

} # end abort
|
|
sub analyze_df_data {

    # Analyze the `df' data just returned by one host:
    #
    #  - Maintain the host's running list of samples in %HOSTS for the
    #    (simple) heuristics applied by heuristics_say_so.
    #  - Classify each filesystem into the highest severity level whose
    #    cutoff it reaches and, if the heuristics agree, append a formatted
    #    alert line to that level's list in the alerts hash.
    #
    # $fs_data_ref - reference to a list of 'mount=used' strings
    # $alerts_ref  - reference to the alerts hash, keyed by severity cutoff;
    #                each value is a reference to a list of alert lines
    # $them        - name of the host the samples came from

    my($fs_data_ref, $alerts_ref, $them) = @ARG;

    # Fetch the samples collected so far for this host.  The declaration is
    # deliberately free of a statement modifier: the behaviour of
    # `my ... if ...' is undefined and can carry the previous call's list
    # (possibly another host's) over into this call.

    my @sample_list = defined $HOSTS{$them} ? split(/%/, $HOSTS{$them}) : ();

    # The count is taken before the new sample is appended; it is what
    # heuristics_say_so compares against $MAX_DF_SAMPLES.

    my $sample_count = @sample_list;
    shift @sample_list if $sample_count >= $MAX_DF_SAMPLES;
    push @sample_list, join(':', @$fs_data_ref);
    $HOSTS{$them} = join '%', @sample_list;

    # Severity cutoffs from highest to lowest.  They are numbers, so sort
    # them numerically; a plain `sort' compares them as strings.

    my @cutoffs = sort { $b <=> $a } keys %SEVERITY;

    my($fs, $filesystem, $used, $severity_level);

    PROCESS_DF_SAMPLES:
    foreach $fs (@$fs_data_ref) {

        ($filesystem, $used) = ($fs =~ /(.*)=(.*)/);
        next PROCESS_DF_SAMPLES unless defined $used;   # not 'mount=used'

        # Only the highest cutoff that is reached is considered.  Remember
        # that the value of an alerts hash entry is a reference to a list.

        DETERMINE_SEVERITY:
        foreach $severity_level (@cutoffs) {
            if ($used >= $severity_level) {
                if (heuristics_say_so($severity_level, $filesystem, $used,
                                      $sample_count, @sample_list)) {
                    push @{$alerts_ref->{$severity_level}},
                        sprintf("%s %s %s %d%%\n",
                                substr(&ctime(time), 0, 19), $them,
                                $filesystem, $used);
                }
                last DETERMINE_SEVERITY;
            } # ifend
        } # forend DETERMINE_SEVERITY

    } # forend PROCESS_DF_SAMPLES

} # end analyze_df_data
|
|
sub check_configuration_files {

    # Pick up on-the-fly changes to the configuration files: whenever the
    # modification time of the exclude-filesystem configuration file differs
    # from the one remembered in $LAST_EF_MTIME, the file is re-read.

    my @file_status = stat $OPT{exclude_filesystem_configuration_file};
    my $modified    = $file_status[$::ST_MTIME];

    if ($modified != $LAST_EF_MTIME) {
        read_efcf();
    }
    $LAST_EF_MTIME = $modified;

} # end check_configuration_files
|
|
|
|
sub construct_main_widgets {

    # Build the main window: a menubar with File and Help menus and the
    # flashing moon, the frame that will hold the dynamically created and
    # destroyed Filesystem bar graphs, and the percentage scale.  There are
    # three grid masters: the menubar frame, the filesystems frame and the
    # percentage label.  The start-up status labels are destroyed just
    # before the real display is gridded.

    update_status('Main Widgets: menus');
    $MW->option('add', '*highlightThickness' => 0);

    my $menubar   = $MW->Frame(-relief => 'raised', -borderwidth => 1);
    my $file_menu = $menubar->Menubutton(
        -text => 'File', -relief => 'raised', -bd => 1
    );
    $file_menu->command(-label => 'Close', -command => [$MW => 'iconify']);
    $file_menu->separator;
    $file_menu->command(-label => 'Quit', -command => \&end_monds);
    my $help_menu = $menubar->Menubutton(
        -text => 'Help', -relief => 'raised', -bd => 1
    );
    $help_menu->command(-label => 'About');     # -command is set once $DDA exists

    update_status('Main Widgets: moon');
    my $phase = $PATTERN_MODULUS % 2;
    $THROB = $menubar->Label(-bitmap => "\@${LIBDIR}/$PATTERNS[$phase]");

    $FS = $MW->Frame;                           # Filesystem grid master
    my $scale_text = ' 90 92 ' . '94 96 98 100';
    my $percentage = $MW->Label(-relief => 'ridge', -text => $scale_text);

    # Global dialogs.

    update_status('Main Widgets: dialogs');
    $DDA = $MW->Dialog(-title => 'About monds');
    $help_menu->entryconfigure('About', -command => [$DDA => 'Show']);

    my $about_text = join '',
        "Monitor Disk Space $VERSION. 97/01/05\n\nExamine ",
        "the log file and display bar graphs of filesystems ",
        "close to capacity. The percentage full is ",
        "indicated by the scale at the bottom of the ",
        "display and the color of the bar. The flashing ",
        "moon rotates 180 degrees everytime the log file ",
        "is updated, typically once a minute.\n\nThe entry ",
        "widgets containing textual information can be ",
        "scrolled by holding down button 2 on the pointing ",
        "device.\n\nSometimes important operator messages ",
        "could appear.\n\nYell for an analyst when you ",
        "start seeing red.";
    $DDA->configure(-wraplength => '5i', -text => $about_text);

    update_status('Main Widgets: heartbeat');

    # Alternate the moon's background between 'azure' and its current
    # background value every 1000 ms.
    my $normal_background = ($THROB->configure(-background))[4];
    flash_widget($THROB, -background, 'azure', $normal_background, 1000);

    # Displays deeper than one bit plane get a pixmap icon.
    if ($MW->depth > 1) {
        my $icon_pixmap = $MW->Pixmap('-file' => "$LIBDIR/monds.xpm");
        $MW->Icon(-image => $icon_pixmap);
    }

    # Kill status widgets and realize the main monds display.

    $STATUS_L->destroy;
    $STATUS->destroy;

    $menubar->grid(-sticky => 'ew');            # menubar grid master
    $file_menu->grid(-row => 0, -column => 0);
    $THROB->grid(-row => 0, -column => 1);
    $menubar->gridColumnconfigure(1, -weight => 1);
    $help_menu->grid(-row => 0, -column => 2);
    $FS->grid;                                  # filesystems grid master
    $percentage->grid;                          # percentage grid master

} # end construct_main_widgets
|
|
|
|
sub display_poll_results {

    # Turn the lines in @FS_ATTENTION_DATA into the graphical display:
    #
    #  - Flip the moon bitmap when the time stamp on the first line differs
    #    from the one seen on the previous call.
    #  - Create a Filesystem object for every line that has none yet.
    #  - Update each object's text and, for normal lines, its percentage
    #    and bar color.
    #  - Let the Filesystem class delete objects that were not touched.
    #
    # A line whose seventh field is not of the form NN% is treated as an
    # error message rather than as a `df' result.

    my @touched_objects = ();
    my $first_line      = 1;
    my($line, $fsr);

    foreach $line (@FS_ATTENTION_DATA) {

        chomp $line;
        my($dow, $month, $mday, $clock, $host, $fs, $pc) = split ' ', $line;

        if ($first_line) {              # rotate the moon if new `df' data has arrived
            $first_line = 0;
            my $stamp = "$dow$month$mday$clock";
            if ($OLD_TIME ne $stamp) {
                ++$PATTERN_MODULUS;
                $THROB->configure(
                    -bitmap => "\@${LIBDIR}/$PATTERNS[$PATTERN_MODULUS % 2]",
                );
                $OLD_TIME = $stamp;
            }
        } # ifend first line

        my($percent) = ($pc =~ /(\d+)%/);       # undefined unless NN% was found
        my $error    = defined($percent) ? 0 : 1;

        # Normal lines are named after host and filesystem; error lines are
        # named after the line itself, blanks and dots mapped to
        # underscores and the first 22 characters dropped.
        my $objname = "$host$fs";
        if ($error) {
            ($objname = $line) =~ tr/ ./_/;
            $objname = substr $objname, 22;
        }

        $fsr = Filesystem->find_object($objname);
        unless ($fsr) {
            $fsr = $MW->Filesystem(-object_name => $objname, -error => $error);
            $fsr->grid(-in => $FS, -sticky => 'w');
        }

        if ($error) {                   # percent is not numeric, must be a `df' error
            $fsr->update_widget($line);
        }
        else {                          # a "normal" `df' output line
            $line = "$host:$fs $dow $month $mday $clock";
            $fsr->update_widget($line, $percent, $COLORS[$percent - 90]);
        }

        push @touched_objects, $fsr;

    } # forend all lines that need attention

    Filesystem->delete_stale_objects(@touched_objects);

} # end display_poll_results
|
|
|
|
sub end_monds {

    # Common way out of monds: used by the Quit menu entry, by abort and as
    # the INT/HUP/TERM signal handler.  Signals the TCP/IP client processes
    # and then destroys the main window.

    stop_tcp_clients();
    $MW->destroy();

} # end end_monds
|
|
sub flash_widget {

    # Flash a widget attribute periodically: apply $val1 now and schedule
    # another call, with the two values swapped, after $interval.
    #
    # $w        - widget to configure
    # $option   - name of the attribute to alternate
    # $val1     - value applied by this call
    # $val2     - value applied by the next call
    # $interval - delay handed to $MW->after
    #
    # Returns whatever $MW->after returns.

    my($w, $option, $val1, $val2, $interval) = @ARG;

    $w->configure($option => $val1);

    my $next_flash = [\&flash_widget, $w, $option, $val2, $val1, $interval];
    return $MW->after($interval, $next_flash);

} # end flash_widget
|
|
sub heuristics_say_so {                 # try to be smart about things

    # The latest `df' sample shows a filesystem at or above a severity
    # cutoff.  Decide whether the operator should be told: if the level is
    # the one labelled like cutoff 90 ('Informative'), enough samples have
    # been collected and the percentage used is the same in every stored
    # sample, the alert is suppressed.
    #
    # $severity_level - cutoff the filesystem has reached
    # $filesys        - filesystem name as stored in the samples
    # $used           - current percentage used
    # $sample_count   - number of samples held before the current one
    # @sample_list    - stored samples, each 'fs=used:fs=used:...'
    #
    # EXIT:  0, FALSE, do not display; 1, TRUE, display.

    my($severity_level, $filesys, $used, $sample_count, @sample_list) = @ARG;

    # Always display when there are too few samples to judge, or when the
    # level is anything other than the 'Informative' one.
    if ($sample_count < $MAX_DF_SAMPLES
            or $SEVERITY{$severity_level} ne $SEVERITY{90}) {
        return 1;
    }

    my($sample, $entry);

    foreach $sample (@sample_list) {            # each sample of machine's filesystems
        foreach $entry (split /:/, $sample) {   # each filesystem in that sample
            my($name, $percent) = split /=/, $entry;
            if ($name eq $filesys and $percent != $used) {
                return 1;                       # display - %used is changing
            }
        }
    }

    return 0;                                   # don't display - %used is unchanging

} # end heuristics_say_so
|
|
|
|
sub initialize {                        # preset monitor_disk_space

    # One-time set-up: install the signal handlers, fill in the global
    # tables, read the configuration file, start one TCP/IP client per
    # host and finally build the main display.

    update_status('Global Variables');

    my $signal_name;
    foreach $signal_name (qw(INT HUP TERM)) {
        $SIG{$signal_name} = \&end_monds;
    }

    $PORT           = 10346;
    $MAX_DF_SAMPLES = 15;               # number of `df's per host

    %SEVERITY = (                       # severity cutoff hash
        98 => '*** Critical ***',
        96 => 'Urgent',
        94 => 'Serious',
        90 => 'Informative',
    );

    # @COLORS is indexed by (percent used - 90): 90..93 informative,
    # 94..95 serious, 96..97 urgent, 98..100 critical.
    my($color_informative, $color_serious, $color_urgent, $color_critical) =
        qw(green yellow orange red);
    @COLORS = (
        ($color_informative) x 4,
        ($color_serious)     x 2,
        ($color_urgent)      x 2,
        ($color_critical)    x 3,
    );

    my $time_string = &ctime(time);     # result is not used within this routine

    @PATTERNS        = ('monds_moon0.xbm', 'monds_moon1.xbm'); # bitmaps of moon phases
    $PATTERN_MODULUS = -1;              # phase modulus
    $OLD_TIME        = '';              # time of last phase change

    update_status('Configuration File(s)');
    umask 0022;                         # 022, base 8
    $LAST_EF_MTIME = 1;
    check_configuration_files();

    start_tcp_clients();
    construct_main_widgets();

} # end initialize
|
|
sub op_msg {

    # Append one operator message to a message list.  The stored line
    # consists of the first 19 characters of the current ctime string, the
    # host name left-justified in 20 columns, the message text and a
    # newline.
    #
    # $op_msg_ref - reference to the list that receives the line
    # $them       - host the message is about
    # $msg        - message text

    my($op_msg_ref, $them, $msg) = @ARG;

    my $timestamp = substr(&ctime(time), 0, 19);
    push @$op_msg_ref, sprintf("%s %-20s%s\n", $timestamp, $them, $msg);

} # end op_msg
|
|
|
|
sub output {

    # Append to @FS_ATTENTION_DATA those arguments that begin with a
    # three-letter day name followed by a blank, i.e. the lines that carry
    # alert messages.  Every other argument is dropped.

    my $candidate;

    foreach $candidate (@ARG) {
        next unless $candidate =~ /^Mon |^Tue |^Wed |^Thu |^Fri |^Sat |^Sun /;
        push @FS_ATTENTION_DATA, $candidate;
    }

} # end output
|
|
sub poll_clients {

    no strict qw(refs);                 # pipe handles are kept as symbolic names

    # Called periodically by Tk.  Here's where all the data is collected,
    # sorted, massaged, inspected, rejected, chopped and diced.  First, ask
    # `select' whether any asynchronous TCP/IP client has `df' data for us;
    # assuming one or more of our pipes are hot, we:
    #
    #  - Initialize the %alerts hash:  its keys are the same as the
    #    %SEVERITY hash and its values are references to lists of strings
    #    of formatted filesystem data ready for display.  The keys are
    #    simply the integer thresholds marking the severity levels.
    #  - Make a pass through the host list looking at our read pipe for
    #    each host to see if it's been selected; if so:
    #     - Read the `df' data from the pipe and weed out the header line,
    #       filesystems we don't care about (AFS, NFS mount points) and
    #       filesystems listed in the exclude file.  Build a list of
    #       filesystem items of the form /fs=%used.
    #     - Analyze the host's current filesystem status.
    #  - Build the new attention list and re-read the exclude_filesystem
    #    configuration file if it has been modified.
    #  - Finally, write a line to each client that delivered data, which
    #    signals that it's OK to fetch more `df' data, and display the
    #    current poll results.

    my($severity_level, @important_op_msgs, %alerts);
    my(@fs_data, $ef, $rbits, $df_style);

    # Poll, never wait: a timeout of 0 makes select() return at once.  With
    # an undefined timeout it would block inside this Tk callback until a
    # client had something to say, freezing the event loop in the meantime.
    # A failed select() (-1) is treated like "nothing to read".

    my $nfound = select($rbits = $READ_BITS, undef, undef, 0);
    return unless $nfound > 0;

    foreach $severity_level (keys %SEVERITY) {
        $alerts{$severity_level} = undef;       # no disk utilization alerts
    }
    @important_op_msgs = ();                    # cannot connect, df errors, etc.
    my(@go_ahead)  = ();                        # ACK these hosts for another `df'
    my(@host_list) = @{$OPT{hosts}};            # poll kids for each host
    my $poll_errors = 'Socket Timeout|Cannot Connect|Daemon Failure';

    PROCESS_ALL_HOSTS:
    while ($nfound > 0) {

        my $them = shift(@host_list);
        last PROCESS_ALL_HOSTS unless defined $them;    # host list exhausted

        if (vec($rbits, fileno($CHILD{$them}->{pr}), 1) == 0) {
            next PROCESS_ALL_HOSTS;     # no incoming data from this client
        }
        $nfound--;

        @fs_data = ();                  # initialize filesystem data for this host
        push @go_ahead, $them;          # give a "go-ahead" signal to this task
        my $fh = $CHILD{$them}->{pr};   # parent's (monds') read file handle

        # Process all filesystems except AFS, NFS and those listed in the
        # exclude file.  Ignore the `df' header line and report any `df'
        # or socket I/O errors.

        PROCESS_HOST_FILESYSTEMS:
        while (<$fh>) {

            last PROCESS_HOST_FILESYSTEMS if /^END_OF_DF$/;

            unless (/^AFS|:\//) {

                chomp;                  # drop the line terminator only

                if (/^Filesystem/) {    # header line tells us the `df' flavor
                    $df_style = m/Avail/i ? 'BSD' : 'SYSV';
                    next PROCESS_HOST_FILESYSTEMS;
                }
                if (/^df: |NFS/) {
                    op_msg \@important_op_msgs, $them, $ARG;
                    next PROCESS_HOST_FILESYSTEMS;
                }
                if (/$poll_errors/) {   # give up on this host for this poll
                    op_msg \@important_op_msgs, $them, $ARG;
                    next PROCESS_ALL_HOSTS;
                }

                # Extract data according to Unix flavor.

                my($filesystem, $used, $mount, @l);
                @l = split ' ';
                if ($df_style eq 'BSD') {
                    ($filesystem, $used, $mount) = @l[0,4,5];
                } else {
                    ($filesystem, $used, $mount) = @l[0,3,6];
                }

                # A line with too few fields cannot be interpreted; skip it
                # rather than record an empty 'mount=used' item.
                next PROCESS_HOST_FILESYSTEMS
                    unless defined $used and defined $mount;

                chop $used;             # remove %

                # Ignore filesystems listed in the Exclude File.

                EXCLUDE_SPECIAL_FILESYSTEMS:
                foreach $ef (@{$EXCLUDE{$them}}) {
                    next PROCESS_HOST_FILESYSTEMS if $mount =~ $ef;
                }

                push @fs_data, "$mount=$used";

            } # unlessend

        } # whilend PROCESS_HOST_FILESYSTEMS

        analyze_df_data \@fs_data, \%alerts, $them;

    } # whilend PROCESS_ALL_HOSTS

    write_log_file \@important_op_msgs, \%alerts;
    check_configuration_files;

    my $ready_host;
    foreach $ready_host (@go_ahead) {
        print {$CHILD{$ready_host}->{pw}} "\n";
    }

    display_poll_results;

} # end poll_clients
|
|
sub read_efcf {

    # Rebuild %EXCLUDE from the Exclude Filesystem configuration file.  The
    # file is handed to EvaP_PAC together with the table of commands it may
    # contain.  Afterwards the pseudo host ALL_HOSTS, if present, is folded
    # into the list of every other host and removed.  Nothing is read when
    # the option still holds the placeholder '$optional'.

    undef %EXCLUDE;

    my %commands = (
        "exclude_filesystems|excf" => "exclude_filesystems_proc",
        "#"                        => "comments_proc",
    );

    my $config_file = $OPT{exclude_filesystem_configuration_file};
    if ($config_file ne '$optional') {
        open(I, $config_file) or abort "$config_file open";
        EvaP_PAC "", \*I, %commands;
        close I;
    }

    if (defined $EXCLUDE{'ALL_HOSTS'}) {

        my @for_every_host = @{$EXCLUDE{'ALL_HOSTS'}};
        my($key, $host);

        # Hosts that have a list of their own get the common entries
        # appended ...
        foreach $key (keys %EXCLUDE) {
            next if $key eq 'ALL_HOSTS';
            $EXCLUDE{$key} = [@{$EXCLUDE{$key}}, @for_every_host];
        }

        # ... and monitored hosts without one get a copy of the common list.
        foreach $host (@{$OPT{hosts}}) {
            $EXCLUDE{$host} = [@for_every_host]
                unless defined $EXCLUDE{$host};
        }

        delete $EXCLUDE{'ALL_HOSTS'};
    }

} # end read_efcf
|
|
|
|
sub start_tcp_clients {

    no strict qw(refs);                 # file handles are symbolic names

    # Start one asynchronous task per monitored host.  Each task is the
    # program $LIBDIR/monds_client, run with the host name and $PORT as
    # arguments and with its standard input, output and error connected to
    # monds through two pipes.  The parent's ends of the pipes and the
    # child's process ID are recorded in %CHILD, and the parent's read
    # handle is added to $READ_BITS so that poll_clients can `select' on
    # it.  This allows Tk events to flow even when a socket might be
    # blocked.

    $READ_BITS = '';                    # bit list of parent input file handles
    my $handle_name = 'fh0000';         # incremented to make handle names

    my $host;
    foreach $host (@{$OPT{hosts}}) {

        update_status("TCP/IP Daemon For $host");

        # Four handle names per host, taken in this order.
        my $child_read   = $handle_name++;
        my $parent_write = $handle_name++;
        my $parent_read  = $handle_name++;
        my $child_write  = $handle_name++;

        $CHILD{$host}->{pw} = $parent_write;
        pipe($child_read, $parent_write) or abort 'cr/pw pipe';
        $CHILD{$host}->{pr} = $parent_read;
        pipe($parent_read, $child_write) or abort 'pr/cw pipe';

        my $pid = fork;
        $CHILD{$host}->{pid} = $pid;

        if ($pid) {                     # parent
            close $child_read;
            close $child_write;
            $parent_write->autoflush(1);
            vec($READ_BITS, fileno($parent_read), 1) = 1;
        }
        elsif (defined $pid) {          # child
            close $parent_read;
            close $parent_write;
            open(STDIN,  "<&$child_read")  or abort 'STDIN open';
            open(STDOUT, ">&$child_write") or abort 'STDOUT open';
            open(STDERR, ">&$child_write") or abort 'STDERR open';
            STDOUT->autoflush(1);
            STDERR->autoflush(1);
            exec("$LIBDIR/monds_client", $host, $PORT) or abort 'exec';
        }
        else {                          # fork failed
            abort 'fork';
        } # ifend fork

    } # forend each monitored machine

} # end start_tcp_clients
|
|
|
|
sub stop_tcp_clients {

    # Send SIGTERM to the client process of every monitored host for which
    # a process ID has been recorded in %CHILD.

    my $host;

    foreach $host (@{$OPT{hosts}}) {
        my $pid = $CHILD{$host}->{pid};
        next unless defined $pid;
        kill 'SIGTERM', $pid;
    }

} # end stop_tcp_clients
|
|
|
|
sub update_status {

    # Show the given text, followed by ' ...', in the start-up status label
    # and let Tk redraw so the change becomes visible immediately.
    #
    # Takes one argument: the text to display.

    my $stage = $ARG[0];

    $STATUS->configure(-text => "$stage ...");
    $MW->update();

} # end update_status
|
|
sub write_log_file {

    # Build the list @FS_ATTENTION_DATA used by display_poll_results:
    # first the operator messages, then the alert lines from the most
    # severe level down to the least severe one.  When there is neither,
    # a single "nothing unusual" line is stored instead.
    #
    # $important_op_msgs_ref - reference to the list of operator messages
    # $alerts_ref            - reference to the alerts hash; keys are
    #                          severity cutoffs, values are undef or a
    #                          reference to a list of alert lines

    my($important_op_msgs_ref, $alerts_ref) = @ARG;

    @FS_ATTENTION_DATA = ();

    if (@$important_op_msgs_ref) {
        output @$important_op_msgs_ref;
    }

    # True when at least one severity level has an alert list.
    my $alerts_present = 0;
    if (grep $ARG, values %{$alerts_ref}) {
        $alerts_present = 1;
    }

    # Cutoffs are numbers, so order them numerically, highest first.  The
    # reference is tested instead of `defined @{...}', which newer perls
    # reject at compile time.

    my($severity_level, $alert_lines);
    foreach $severity_level (sort { $b <=> $a } keys %SEVERITY) {
        $alert_lines = $alerts_ref->{$severity_level};
        if ($alert_lines and @$alert_lines) {
            output @$alert_lines;
        }
    } # forend all severity levels

    # `output' keeps only items that start with a day name, so the time
    # stamp and the text must be handed over as ONE string; passed as
    # separate items only the bare time stamp would be stored.

    unless (@$important_op_msgs_ref or $alerts_present) {
        output(substr(&ctime(time), 0, 19) .
               " Monitor Disk Space sees nothing unusual ...\n");
    }

} # end write_log_file
|
|
# Embedded Evaluate Parameters command processors, in alphabetical ord\
|
|
er.
|
|
# Sure makes parsing configuration files a snap.
|
|
|
|
sub exclude_filesystems_proc {

    # Embedded Evaluate Parameters command processor for the
    # exclude_filesystems (excf) command of the configuration file.  Its
    # parameters are evaluated in package exclude_filesystems_pkg; unless
    # EvaP returns 1 nothing is stored.

    package exclude_filesystems_pkg;

    my $status = main::EvaP(\@exclude_filesystems_pkg::PDT,
                            \@exclude_filesystems_pkg::MM);
    return unless $status == 1;

    # Store the excluded filesystems in a hash indexed by hostname.  The
    # list is copied into a new anonymous array because EvaP is now embedded
    # and successive calls return the SAME reference, which would wipe out
    # the previous results.

    my $host = $exclude_filesystems_pkg::Options{'host'};
    $EXCLUDE{$host} = [ @{$exclude_filesystems_pkg::Options{'filesystem'}} ];

} # end exclude_filesystems_proc
|
|
|
|
sub comments_proc {

    # Intentionally does nothing.  It is registered for the command named
    # "#" so that an embedded Evaluate Parameters program, such as the
    # exclude file, may contain comment lines.

    return;

} # end comments_proc
|
|
__END__
|
|
|
|
# monds Data Structures
|
|
#
|
|
# Note: 'hostname' = IP name/address of a monitored machine.
|
|
#
|
|
# %HOSTS
|
|
#
|
|
# A hash indexed by hostname. Each host entry is a si\
|
|
ngle string
|
|
#            consisting of a list of percent-sign-separated `df' samples,
|
|
# each of which is a colon-separated list of filesystems\
|
|
of the
|
|
# form 'fs=used', where 'fs' is the filesystem name and \
|
|
'used'
|
|
# is a percentage indicating how full the filesystem is.
|
|
#
|
|
#            This is Perl 4 parlance for a Perl 5 'hash of list of list':
#            each hash entry consists of a major list of multiple `df' samples
|
|
# consisting of multiple filesystems, with each `df' sam\
|
|
ple
|
|
# demarcated by a percent sign (%) as the list separator\
|
|
.
|
|
# Within each `df' sample there are multiple colon-separ\
|
|
ated (:)
|
|
# filesystem entries of the form 'fs=used'.
|
|
#
|
|
# Now, assuming this is a typical `df' output for host\
|
|
'dillon':
|
|
#
|
|
# Filesystem Total KB free %used iused %iused \
|
|
Mounted on
|
|
# /dev/hd4 4096 1248 69% 679 66% \
|
|
/
|
|
# /dev/hd9var 12288 5004 59% 485 11% \
|
|
/var
|
|
# /dev/hd2 241664 1860 99% 11784 19% \
|
|
/usr
|
|
# /dev/hd3 12288 3208 73% 95 2% \
|
|
/tmp
|
|
# /dev/lv00 20480 2912 85% 1718 27% \
|
|
/var/vice
|
|
# AFS 72000000 72000000 0% 0 0%\
|
|
/afs
|
|
#
|
|
# the initial %HOSTS hash entry for 'dillon' might be:
|
|
#
|
|
# $HOSTS{'dillon'} = '/=69:/var=59:/usr=99:/tmp=73';
|
|
#
|
|
# subject to excluded filesystems, as well as "we don'\
|
|
t care"
|
|
# filesystems such as AFS and NFS mount points.
|
|
#
|
|
# Finally, note that we maintain $MAX_DF_SAMPLES of th\
|
|
is
|
|
# colon-separated list of filesystems, each separated by\
|
|
a
|
|
# percent-sign as the major-list-separator. Each 'maj\
|
|
or-list'
|
|
# `df' sample reflects the state of the machine's \
|
|
filesystems at
|
|
# approximately one-minute intervals; since we have this\
|
|
list of
|
|
# time-ordered filesystem data we can use heurist\
|
|
ics to
|
|
# determine, among other things, the rate of change of u\
|
|
sage.
|
|
#
|
|
# %EXCLUDE
|
|
#
|
|
# A hash indexed by hostname. Each host entry is a re\
|
|
ference to
|
|
# a list of filesystem names to exclude from considerati\
|
|
on by
|
|
# monds. For instance, your /usr partition might be rat\
|
|
her full
|
|
# but is in no danger of running out of space, so rather\
|
|
than
|
|
# adding extraneous "noise" to the monds display you wou\
|
|
ld
|
|
# exclude this filesystem. Similarly, sites using AFS t\
|
|
ypically
|
|
# would exclude the filesystem /var/vice/cache since it'\
|
|
s of a
|
|
# fixed size and cannot be overflowed. A standard hash \
|
|
entry
|
|
# might look like:
|
|
#
|
|
# $EXCLUDE{'dillon'} = $ref_to_list;
|
|
#
|
|
# where the scalar $ref_to_list is a reference to a 'l\
|
|
ist of
|
|
# filesystems'; e.g.
|
|
#
|
|
# ('/var/vice/cache', '/usr')
|
|
#
|
|
# %CHILD
|
|
#
|
|
# A Perl hash of hash, indexed by hostname and the fol\
|
|
lowing
|
|
# keys:
|
|
#
|
|
# {'dillon'}->{pid} = child process ID (for KILLing\
|
|
)
|
|
# {'dillon'}->{pr} = file handle of parent read pi\
|
|
pe
|
|
# {'dillon'}->{pw} = file handle of parent write p\
|
|
ipe
|
|
#
|
|
# %SEVERITY
|
|
#
|
|
#            A hash indexed by severity level.  Each monitored machine's
|
|
# filesystems are categorized by a percentage indicating\
|
|
how full
|
|
# that filesystem is, with 100% being "out of space". W\
|
|
hile the
|
|
# severity levels are arbitrary, the following scheme is\
|
|
used:
|
|
#
|
|
# %SEVERITY = (
|
|
# 98 => '*** Critical ***',
|
|
# 96 => 'Urgent',
|
|
# 94 => 'Serious',
|
|
# 90 => 'Informative',
|
|
# );
|
|
#
|
|
# A filesystem that reaches 90% of capacity is declare\
|
|
d to be of
|
|
# severity level 'Informative' - "Hello. I'm here."
|
|
#
|
|
# A filesystem that reaches 94% of capacity is 'Seriou\
|
|
s' - "Hmm,
|
|
# keep an eye on me."
|
|
#
|
|
# A filesystem that reaches 96% of capacity is 'Urgent\
|
|
' - "Wow,
|
|
# I'm definitely low on available disk space."
|
|
#
|
|
# A filesystem that reaches 98% of capacity is '*** Cr\
|
|
itical ***'
|
|
# - "Argh! I'm hurting now! Do something!"
|
|
#
|
|
# @FS_ATTENTION_DATA
|
|
#
|
|
# A list of monds log file lines extracted from the %ale\
|
|
rts hash.
|
|
# Used by display_poll_results() rather than actually re\
|
|
ading the
|
|
# log file. An "error" line and a "normal" line look li\
|
|
ke:
|
|
#
|
|
# Fri May 26 10:07:15 turkey Cannot Connect
|
|
# Fri May 26 10:07:15 cs1 /u5 \
|
|
98%
|