Diffstat (limited to 'global/overlay/usr/local')
-rwxr-xr-x  global/overlay/usr/local/bin/scriptherder | 302
1 file changed, 232 insertions(+), 70 deletions(-)
diff --git a/global/overlay/usr/local/bin/scriptherder b/global/overlay/usr/local/bin/scriptherder
index c11383a..1e00ec0 100755
--- a/global/overlay/usr/local/bin/scriptherder
+++ b/global/overlay/usr/local/bin/scriptherder
@@ -68,6 +68,28 @@ exit_status = {'OK': 0,
}
+class ScriptHerderError(Exception):
+ """
+ Base exception class for scriptherder.
+ """
+
+ def __init__(self, reason, filename):
+ self.reason = reason
+ self.filename = filename
+
+
+class JobLoadError(ScriptHerderError):
+ """
+ Raised when loading a job file fails.
+ """
+
+
+class CheckLoadError(ScriptHerderError):
+ """
+ Raised when loading a check file fails.
+ """
+
+
class Job(object):
"""
Representation of an execution of a job.
@@ -109,6 +131,21 @@ class Job(object):
exit = self.exit_status,
)
+ def status_summary(self):
+ """
+ Return short string with status of job.
+
+ E.g. 'name[exit=0,age=19h]'
+ """
+ if self._end_time is None or self._start_time is None:
+ return '{name}[not_running]'.format(name = self.name)
+ age = _time_to_str(time.time() - self._start_time)
+ return '{name}[exit={exit_status},age={age}]'.format(
+ name = self.name,
+ exit_status = self._exit_status,
+ age = age,
+ )
+
@property
def name(self):
"""
@@ -167,11 +204,10 @@ class Job(object):
@rtype: string
"""
+ if self._end_time is None or self._start_time is None:
+ return 'NaN'
duration = self._end_time - self._start_time
- if duration < 1:
- # milliseconds
- return '{:0.3f}ms'.format(duration * 1000)
- return '{:0.3f}s'.format(duration)
+ return _time_to_str(duration)
@property
def exit_status(self):
@@ -326,7 +362,7 @@ class Job(object):
#self._output_size = data.get('output_size') # currently not used in scriptherder
self._filename = filename
else:
- raise AssertionError('Unknown version in file {!r}: {!r}'.format(filename, data.get('version')))
+ raise JobLoadError('Unknown version: {!r}'.format(data.get('version')), filename=filename)
return self
@@ -355,7 +391,7 @@ class Check(object):
self.logger = logger
self.config = ConfigParser.ConfigParser(_check_defaults)
if not self.config.read([filename]):
- raise ValueError("Failed loading config file {!r}".format(filename))
+ raise ScriptHerderError('Failed loading config file', filename)
_section = 'check'
self._ok_criteria = [x.strip() for x in self.config.get(_section, 'ok').split(',')]
self._warning_criteria = [x.strip() for x in self.config.get(_section, 'warning').split(',')]
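The check definitions read here are plain INI files with a 'check' section holding comma-separated 'ok' and 'warning' criteria. A minimal sketch of such a file, with illustrative criteria and path (the exact criteria grammar is defined further down in the script):

    # <checkdir>/myjob.ini (filename must match the job name plus '.ini')
    [check]
    ok = exit_status=0, max_age=25h
    warning = exit_status=0, max_age=49h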
@@ -422,6 +458,130 @@ class Check(object):
return False
+class CheckStatus(object):
+ """
+ Aggregated status of job invocations for --mode check.
+
+ Attributes:
+
+ checks_ok: List of checks in OK state ([Job()]).
+ checks_warning: List of checks in WARNING state ([Job()]).
+ checks_critical: List of checks in CRITICAL state ([Job()]).
+ """
+
+ def __init__(self, args, logger):
+ """
+ @param args: Parsed command line arguments
+ @param logger: logging logger
+ """
+
+ self.checks_ok = []
+ self.checks_warning = []
+ self.checks_critical = []
+
+ self._jobs = _get_job_results(args, logger)
+ # group the jobs by their name
+ _by_name = {}
+ for this in self._jobs:
+ if this.name not in _by_name:
+ _by_name[this.name] = []
+ _by_name[this.name].append(this)
+ self._jobs_by_name = _by_name
+
+ self._job_count = len(_by_name)
+
+ self._check_running_jobs(args, logger)
+ if not args.cmd:
+ self._check_not_running(args, logger)
+
+ def _check_running_jobs(self, args, logger):
+ """
+ Look for job execution entries (parsed into Job() instances), group them
+ per check name and determine the status. For each group, append status
+ to one of the three aggregate status lists of this object (checks_ok,
+ checks_warning or checks_critical).
+
+ @param args: Parsed command line arguments
+ @param logger: logging logger
+ """
+ # determine total check status based on all logged invocations of this job
+ for (name, jobs) in self._jobs_by_name.items():
+ # Load the evaluation criteria for this job
+ check_filename = os.path.join(args.checkdir, name + '.ini')
+ logger.debug("Loading check definition from {!r}".format(check_filename))
+ try:
+ check = Check(check_filename, logger)
+ except ScriptHerderError as exc:
+ logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
+ raise CheckLoadError('Failed loading check', filename = check_filename)
+
+ # Sort jobs, oldest first
+ jobs = sorted(jobs, key=lambda x: x.start_time)
+ logger.debug("Checking {!r}: {!r}".format(name, jobs))
+
+ jobs_ok = []
+ jobs_warning = []
+ jobs_critical = []
+ for job in jobs:
+ if check.job_is_ok(job):
+ jobs_ok.append(job)
+ elif check.job_is_warning(job):
+ jobs_warning.append(job)
+ else:
+ jobs_critical.append(job)
+
+ logger.debug("Raw status OK : {!r}".format(jobs_ok))
+ logger.debug("Raw status WARN : {!r}".format(jobs_warning))
+ logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
+
+ # add most recent job status to the totals
+ if jobs_ok:
+ self.checks_ok.append(jobs_ok[-1])
+ elif jobs_warning:
+ self.checks_warning.append(jobs_warning[-1])
+ else:
+ self.checks_critical.append(jobs_critical[-1])
+
+ def _check_not_running(self, args, logger):
+ """
+ Look for check definitions (.ini files in the check directory) that have no
+ logged job invocations at all, and append a placeholder Job() for each such
+ missing job to the checks_critical list of this object.
+
+ @param args: Parsed command line arguments
+ @param logger: logging logger
+ """
+ files = [f for f in os.listdir(args.checkdir) if os.path.isfile(os.path.join(args.checkdir, f))]
+ for this in files:
+ if not this.endswith('.ini'):
+ continue
+ filename = os.path.join(args.checkdir, this)
+ logger.debug("Loading check definition from {!r}".format(filename))
+ try:
+ # validate check loads
+ Check(filename, logger)
+ except ScriptHerderError as exc:
+ logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
+ raise CheckLoadError('Failed loading check', filename = filename)
+ name = this[:-4] # remove the '.ini' suffix
+ if name not in self._jobs_by_name:
+ logger.debug('Check {!r} (filename {!r}) not found in jobs'.format(name, filename))
+ job = Job(name=name)
+ self.checks_critical.append(job)
+ self._job_count += 1
+ else:
+ logger.debug('Check {!r} has {!r} logged results'.format(name, len(self._jobs_by_name[name])))
+
+ def num_jobs(self):
+ """
+ Return the number of jobs processed. This is the number of distinct jobs, running or not.
+
+ @rtype: int
+ """
+ return self._job_count
+
+
def job_from_file(filename):
"""
Recreate Job() instance from saved file.
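A minimal sketch of how the new CheckStatus class is driven (it mirrors mode_check() below; `args` is assumed to be the parsed argparse namespace with the cmd/checkdir/datadir attributes used throughout the script):

    import logging

    logger = logging.getLogger('scriptherder')
    try:
        status = CheckStatus(args, logger)   # loads job results and check definitions
    except CheckLoadError as exc:
        # a broken or missing .ini file becomes a Nagios UNKNOWN in mode_check()
        print("UNKNOWN: Failed loading check from file '{!s}' ({!s})".format(exc.filename, exc.reason))
    else:
        print('{!s} job(s) checked, {!s} critical'.format(status.num_jobs(), len(status.checks_critical)))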
@@ -488,6 +648,7 @@ def parse_args(defaults):
)
args = parser.parse_args()
+
return args
@@ -537,88 +698,61 @@ def mode_check(args, logger):
@param args: Parsed command line arguments
@param logger: logging logger
"""
- jobs = _get_job_results(args, logger)
- # group the jobs by their name
- by_name = {}
- for this in jobs:
- if this.name not in by_name:
- by_name[this.name] = []
- by_name[this.name].append(this)
-
- total_ok = []
- total_warning = []
- total_critical = []
-
- # determine total check status based on all logged invocations of this job
- for (name, jobs) in by_name.items():
- # Sort jobs, oldest first
- jobs = sorted(jobs, key=lambda x: x.start_time)
- # Load the evaluation criterias for this job
- check_filename = os.path.join(args.checkdir, name + '.ini')
- logger.debug("Loading check definition from {!r}".format(check_filename))
- check = Check(check_filename, logger)
- logger.debug("Checking {!r}: {!r}".format(name, jobs))
-
- jobs_ok = []
- jobs_warning = []
- jobs_critical = []
- for job in jobs:
- if check.job_is_ok(job):
- jobs_ok.append(job)
- elif check.job_is_warning(job):
- jobs_warning.append(job)
- else:
- jobs_critical.append(job)
- logger.debug("Raw status OK : {!r}".format(jobs_ok))
- logger.debug("Raw status WARN : {!r}".format(jobs_warning))
- logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
- if jobs_ok:
- total_ok.append(jobs_ok[-1])
- elif jobs_warning:
- total_warning.append(jobs_warning[-1])
- else:
- total_critical.append(jobs_critical[-1])
+ try:
+ status = CheckStatus(args, logger)
+ except CheckLoadError as exc:
+ print("UNKNOWN: Failed loading check from file '{!s}' ({!s})".format(exc.filename, exc.reason))
+ return exit_status['UNKNOWN']
if args.cmd:
# Single job check requested, output detailed information
- if total_ok:
- print('OK: {!s}'.format(total_ok[-1]))
+ if status.checks_ok:
+ print('OK: {!s}'.format(status.checks_ok[-1]))
return exit_status['OK']
- if total_warning:
- print('WARNING: {!s}'.format(total_warning[-1]))
+ if status.checks_warning:
+ print('WARNING: {!s}'.format(status.checks_warning[-1]))
return exit_status['WARNING']
- if total_critical:
- print('CRITICAL: {!s}'.format(total_critical[-1]))
+ if status.checks_critical:
+ print('CRITICAL: {!s}'.format(status.checks_critical[-1]))
return exit_status['CRITICAL']
print "UNKNOWN - no jobs found for {!r}?".format(args.cmd)
return exit_status['UNKNOWN']
- # When not looking at multiple jobs at once, logic gets a bit reversed - if ANY
+ # When looking at multiple jobs at once, logic gets a bit reversed - if ANY
# job invocation is CRITICAL/WARNING, the aggregate message given to
# Nagios will have to be a failure.
- if total_critical:
- print("CRITICAL: {num} job(s) in this state: {names}".format(
- num = len(total_critical),
- names = ', '.join([str(x.name) for x in total_critical]),
- ))
+ if status.checks_critical:
+ print('CRITICAL: {!s}'.format(
+ _status_summary(status.num_jobs(), status.checks_critical)))
return exit_status['CRITICAL']
- if total_warning:
- print("WARNING: {num} job(s) in this state: {names}".format(
- num = len(total_warning),
- names = ', '.join([str(x.name) for x in total_warning]),
- ))
+ if status.checks_warning:
+ print('WARNING: {!s}'.format(
+ _status_summary(status.num_jobs(), status.checks_warning)))
return exit_status['WARNING']
- if total_ok:
- print("OK: {num} job(s) in this state: {names}".format(
- num = len(total_ok),
- names = ', '.join([x.name for x in total_ok]),
- ))
+ if status.checks_ok:
+ print('OK: {!s}'.format(
+ _status_summary(status.num_jobs(), status.checks_ok)))
return exit_status['OK']
print "UNKNOWN - no jobs found?"
return exit_status['UNKNOWN']
+def _status_summary(num_jobs, failed):
+ """
+ String formatting helper used in the status output of --mode check.
+ """
+ fmt = '{jobs}/{num_jobs} job in this state: {summary}'
+ if len(failed) != 1:
+ fmt = '{jobs}/{num_jobs} jobs in this state: {summary}'
+
+ summary = ', '.join(sorted([str(x.status_summary()) for x in failed]))
+ return fmt.format(jobs = len(failed),
+ num_jobs = num_jobs,
+ summary = summary,
+ )
+
+
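A rough illustration of the aggregate line this helper produces (job names, ages and the total of five jobs are made up):

    class _FakeJob(object):
        """Stand-in exposing the only method _status_summary() relies on."""
        def __init__(self, summary):
            self._summary = summary
        def status_summary(self):
            return self._summary

    failed = [_FakeJob('backup[exit=1,age=3h]'), _FakeJob('sync[exit=2,age=25h]')]
    print('CRITICAL: {!s}'.format(_status_summary(5, failed)))
    # -> CRITICAL: 2/5 jobs in this state: backup[exit=1,age=3h], sync[exit=2,age=25h]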
def _get_job_results(args, logger):
"""
Load all jobs matching any specified name on the command line.
@@ -634,7 +768,10 @@ def _get_job_results(args, logger):
if not this.endswith('.json'):
continue
filename = os.path.join(args.datadir, this)
- job = job_from_file(filename)
+ try:
+ job = job_from_file(filename)
+ except JobLoadError as exc:
+ logger.warning("Failed loading job file '{!s}' ({!s})".format(exc.filename, exc.reason))
if args.cmd:
if args.cmd[0] != job.name:
logger.debug("Skipping '{!s}' not matching '{!s}' (file {!s})".format(job.name, args.cmd[0], filename))
@@ -666,6 +803,27 @@ def _parse_time_value(value):
return num
+def _time_to_str(value):
+ """
+ Format number of seconds to short readable string.
+
+ @type value: float or int
+
+ @rtype: string
+ """
+ if value < 1:
+ # milliseconds
+ return '{:0.3f}ms'.format(value * 1000)
+ if value < 60:
+ return '{!s}s'.format(int(value))
+ if value < 3600:
+ return '{!s}m'.format(int(value / 60))
+ if value < 86400:
+ return '{!s}h'.format(int(value / 3600))
+ days = int(value / 86400)
+ return '{!s}d{!s}h'.format(days, int((value % 86400) / 3600))
+
+
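Expected behaviour of this helper for a few illustrative durations:

    print(_time_to_str(0.5))     # '500.000ms'
    print(_time_to_str(42))      # '42s'
    print(_time_to_str(300))     # '5m'
    print(_time_to_str(7200))    # '2h'
    print(_time_to_str(90000))   # '1d1h'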
def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults):
"""
Main entry point for either wrapping a script, or checking the status of it.
@@ -699,6 +857,10 @@ def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults
syslog_h.setFormatter(formatter)
logger.addHandler(syslog_h)
+ if args.name and args.mode != 'wrap':
+ logger.error('Argument --name only applicable for --mode wrap')
+ return False
+
if args.mode == 'wrap':
return mode_wrap(args, logger)
elif args.mode == 'ls':