diff options
author | Fredrik Thulin <fredrik@thulin.net> | 2014-05-05 12:53:51 +0200 |
---|---|---|
committer | Fredrik Thulin <fredrik@thulin.net> | 2014-05-05 12:53:51 +0200 |
commit | d000b1adea71cc89f31d4509c135768ecc796ba3 (patch) | |
tree | ca607a5066b5696ef6a13bb3b13e64f1b38e04f3 /global/overlay/usr/local/bin | |
parent | 0e9331718dba878570505faa0e28fa57f1e6f762 (diff) |
Diffstat (limited to 'global/overlay/usr/local/bin')
-rwxr-xr-x | global/overlay/usr/local/bin/run-cosmos | 20 | ||||
-rwxr-xr-x | global/overlay/usr/local/bin/scriptherder | 722 |
2 files changed, 742 insertions, 0 deletions
#!/bin/sh
#
# Simplify running cosmos, with serialization if flock is available.
#
# Runs `cosmos update' followed by `cosmos apply', passing all command line
# arguments on to both invocations. When both succeed, the stamp file
# /var/run/last-cosmos-ok.stamp is touched.
#

set -e

# The lookup exits non-zero when flock is not installed. Because of `set -e'
# that would abort the whole script right here, making the fallback branch
# below unreachable - hence the `|| true'.
FLOCK=$(command -v flock || true)

if [ -n "$FLOCK" ] && [ -x "$FLOCK" ]; then
    (
        # Wait up to 60 seconds for an exclusive lock on file descriptor 9,
        # which the redirection below opens on the lock file.
        "$FLOCK" --exclusive --wait 60 9 || exit 1
        # "$@" keeps every argument intact, including ones containing spaces.
        cosmos "$@" update
        cosmos "$@" apply
    ) 9>/var/lock/run-cosmos
else
    cosmos "$@" update
    cosmos "$@" apply
fi

touch /var/run/last-cosmos-ok.stamp
#!/usr/bin/env python
#
# Copyright 2014 SUNET. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are
# permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice, this list of
#       conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright notice, this list
#       of conditions and the following disclaimer in the documentation and/or other materials
#       provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY SUNET ``AS IS'' AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SUNET OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are those of the
# authors and should not be interpreted as representing official policies, either expressed
# or implied, of SUNET.
#
# Author : Fredrik Thulin <fredrik@thulin.net>
#

"""
Scriptherder can be run in one of the following modes:

   wrap  -- Stores output, exit status etc. about a script invocation
   ls    -- Lists the logged script invocations
   check -- Check if script execution results match given criteria,
            output Nagios compatible result

"""

import os
import re
import sys
import time
import json
import logging
import logging.handlers
import argparse
import subprocess

try:
    import ConfigParser
except ImportError:
    # The module is called configparser on Python 3
    import configparser as ConfigParser

try:
    _string_types = basestring
except NameError:
    # Python 3 has no basestring
    _string_types = str

_defaults = {'debug': False,
             'syslog': False,
             'mode': 'ls',
             'datadir': '/var/cache/scriptherder',
             'checkdir': '/etc/scriptherder/check',
             }

_check_defaults = {'ok': 'exit_status=0,max_age=8h',
                   'warning': 'exit_status=0,max_age=24h',
                   }

exit_status = {'OK': 0,
               'WARNING': 1,
               'CRITICAL': 2,
               'UNKNOWN': 3,
               }


class Job(object):
    """
    Representation of an execution of a job.

    A Job is either created with a command and then executed using `run()',
    or populated from a previously saved record using `from_file()'.
    """

    def __init__(self, name, cmd=None):
        """
        @param name: Name of the job, or None to use the basename of the command
        @param cmd: Command and arguments to execute

        @type name: string or None
        @type cmd: [string] or None
        """
        if cmd is None:
            cmd = []
        for x in cmd:
            assert(isinstance(x, _string_types))
        self._name = name
        self._cmd = cmd
        self._start_time = None
        self._end_time = None
        self._exit_status = None
        self._pid = None
        self._output = None
        self._filename = None
        self._output_filename = None
        # Only derive a name from the command when there is a command
        if self._name is None and self._cmd:
            self._name = os.path.basename(self.cmd)

    def __repr__(self):
        start = time.strftime('%Y-%m-%d %X', time.localtime(self.start_time))
        return '<{} instance at {:#x}: \'{name}\' start={start}, exit={exit}>'.format(
            self.__class__.__name__,
            id(self),
            name = self.name,
            start = start,
            exit = self.exit_status,
        )

    def __str__(self):
        start = time.strftime('%Y-%m-%d %X', time.localtime(self.start_time))
        return '\'{name}\' start={start}, duration={duration:>6}, exit={exit}'.format(
            name = self.name,
            start = start,
            duration = self.duration_str,
            exit = self.exit_status,
        )

    @property
    def name(self):
        """
        The name of the job.

        @rtype: string or None
        """
        if self._name is None:
            return self.cmd
        return self._name

    @property
    def cmd(self):
        """
        The wrapped scripts name, or None if this job has no command.

        @rtype: string or None
        """
        if not self._cmd:
            return None
        return self._cmd[0]

    @property
    def args(self):
        """
        The wrapped scripts arguments.

        @rtype: [string]
        """
        return self._cmd[1:]

    @property
    def start_time(self):
        """
        The start time of the script invocation.

        @rtype: int() or None
        """
        if self._start_time is None:
            return None
        return int(self._start_time)

    @property
    def end_time(self):
        """
        The end time of the script invocation.

        @rtype: int() or None
        """
        if self._end_time is None:
            return None
        return int(self._end_time)

    @property
    def duration_str(self):
        """
        Time spent executing job, as a human readable string.

        Returns 'NaN' for a job lacking a start or end time (one that has
        neither been run nor loaded from a file).

        @rtype: string
        """
        if self._start_time is None or self._end_time is None:
            return 'NaN'
        duration = self._end_time - self._start_time
        if duration < 1:
            # milliseconds
            return '{:0.3f}ms'.format(duration * 1000)
        return '{:0.3f}s'.format(duration)

    @property
    def exit_status(self):
        """
        The exit status of the script invocation.

        @rtype: int() or None
        """
        return self._exit_status

    @property
    def pid(self):
        """
        The process ID of the script invocation.

        @rtype: int() or None
        """
        return self._pid

    @property
    def filename(self):
        """
        The filename this job is stored in.

        @rtype: string or None
        """
        return self._filename

    @property
    def output(self):
        """
        The output (STDOUT and STDERR) of the script invocation.

        Loaded from the output file the first time it is needed, if it is not
        already held in memory.

        @rtype: string or None
        """
        if not self._output and self.output_filename:
            with open(self.output_filename, 'r') as f:
                self._output = f.read()
        return self._output

    @property
    def output_filename(self):
        """
        The name of the file holding the output (STDOUT and STDERR) of the script invocation.

        @rtype: string or None
        """
        return self._output_filename

    def run(self):
        """
        Run script, storing various aspects of the results.

        STDERR is merged into STDOUT, and the script is executed with '/' as
        working directory.
        """
        self._start_time = time.time()
        proc = subprocess.Popen(self._cmd,
                                cwd='/',
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                )
        (stdout, _stderr) = proc.communicate()
        self._end_time = time.time()
        self._output = stdout
        self._exit_status = proc.returncode
        self._pid = proc.pid

    def save_to_file(self, datadir, logger, filename=None):
        """
        Create a record with the details of a script invocation.

        The metadata is stored as JSON in '<filename>.json', and any output in
        '<filename>_output.data'. Both files are written under a temporary
        name and then renamed into place.

        @param datadir: Directory to keep records in
        @param logger: logging logger
        @param filename: Filename to use - default is reasonably constructed

        @type datadir: string
        @type logger: logging.logger
        @type filename: string or None
        """
        if filename is None:
            # Replace everything but letters and digits in the job name
            fn = ''.join([x if x.isalnum() else '_' for x in self.name])
            filename = '{!s}_{!s}_{!s}'.format(fn, self.start_time, self.pid)
        fn = os.path.join(datadir, filename)
        output_fn = fn + '_output'
        data = {'name': self.name,
                'cmd': self._cmd,
                'start_time': self._start_time,
                'end_time': self._end_time,
                'pid': self.pid,
                'exit_status': self.exit_status,
                'version': 2,
                }

        # The output is written before the metadata, so that a metadata file
        # never refers to an output file that is not yet in place.
        if self._output:
            data['output_filename'] = output_fn + '.data'
            data['output_size'] = len(self._output)
            logger.debug("Saving job output to file {!r}".format(output_fn))
            # Output captured by run() is bytes on Python 3
            mode = 'wb' if isinstance(self._output, bytes) else 'w'
            with open(output_fn + '.tmp', mode) as f:
                f.write(self._output)
            os.rename(output_fn + '.tmp', output_fn + '.data')
            self._output_filename = output_fn + '.data'

        logger.debug("Saving job metadata to file {!r}.tmp".format(fn))
        with open(fn + '.tmp', 'w') as f:
            f.write(json.dumps(data, indent = 4, sort_keys = True))
            f.write('\n')
        os.rename(fn + '.tmp', fn + '.json')
        # Record the name of the file actually created, as from_file() does
        self._filename = fn + '.json'

    def from_file(self, filename):
        """
        Initialize this Job instance with data loaded from a file (previously created with
        `save_to_file()').

        Version 1 records hold the output inline, version 2 records refer to
        a separate output file.

        @param filename: Filename to load data from
        @type filename: string

        @raise AssertionError: If the file has an unknown version

        @rtype: Job
        """
        with open(filename, 'r') as f:
            # Read at most 100 MB
            data = json.loads(f.read(100 * 1024 * 1024))
        version = data.get('version')
        if version not in (1, 2):
            raise AssertionError('Unknown version in file {!r}: {!r}'.format(filename, version))

        self._name = data.get('name')
        for x in data['cmd']:
            assert(isinstance(x, _string_types))
        self._cmd = data['cmd']
        self._start_time = data['start_time']
        self._end_time = data['end_time']
        self._pid = data['pid']
        self._exit_status = data['exit_status']
        if version == 1:
            self._output = data['output']
            self._output_filename = None
        else:
            # Loaded from output_filename when first needed
            self._output = None
            self._output_filename = data.get('output_filename')
        self._filename = filename
        return self


class Check(object):
    """
    Conditions for the 'check' command. Loaded from file (one file per job name),
    and used to check if a Job instance is OK or WARNING or ...
    """

    def __init__(self, filename, logger):
        """
        Load check criteria from a file.

        Example file contents:

          [check]
          ok = exit_status=0, max_age=8h
          warning = exit_status=0, max_age=24h

        @param filename: INI file with check criteria for a specific job
        @param logger: logging logger

        @type filename: string
        @type logger: logging.logger

        @raise ValueError: If the file could not be loaded
        """
        self.logger = logger
        self.config = ConfigParser.ConfigParser(_check_defaults)
        if not self.config.read([filename]):
            raise ValueError("Failed loading config file {!r}".format(filename))
        _section = 'check'
        self._ok_criteria = [x.strip() for x in self.config.get(_section, 'ok').split(',')]
        self._warning_criteria = [x.strip() for x in self.config.get(_section, 'warning').split(',')]

    def job_is_ok(self, job):
        """
        Evaluate a Job against the OK criteria for this check.

        @type job: Job

        @rtype: bool
        """
        return self._job_matches(self._ok_criteria, job, 'OK')

    def job_is_warning(self, job):
        """
        Evaluate a Job against the WARNING criteria for this check.

        @type job: Job

        @rtype: bool
        """
        return self._job_matches(self._warning_criteria, job, 'WARNING')

    def _job_matches(self, criterias, job, label):
        """
        Check if a job fulfills every one of a list of criteria.

        All criteria are evaluated, also after one has failed, in order
        to get every failure logged.

        @param criterias: The criteria to test
        @param job: The job
        @param label: Name of the status being tested for, used in logging

        @type criterias: [string]
        @type job: Job
        @type label: string

        @rtype: bool
        """
        res = True
        for this in criterias:
            if not self._evaluate(this, job):
                self.logger.debug("Job {!r} failed {!s} criteria {!r}".format(job, label, this))
                res = False
        self.logger.debug("{!r} is {!s} result: {!r}".format(job, label, res))
        return res

    def _evaluate(self, criteria, job):
        """
        The actual evaluation engine.

        Whitespace around the name and the value is ignored. Unknown or
        malformed criteria evaluate to False.

        @param criteria: The criteria to test ('max_age=8h' for example)
        @param job: The job

        @type criteria: string
        @type job: Job

        @rtype: bool
        """
        # partition() copes with both a missing '=' and more than one
        (what, _sep, value) = criteria.partition('=')
        what = what.strip()
        value = value.strip()
        if what == 'exit_status':
            try:
                value = int(value)
            except ValueError:
                self.logger.debug("Evaluation of malformed criteria {!r}, defaulting to False".format(criteria))
                return False
            res = (job.exit_status == value)
            self.logger.debug("Evaluate criteria {!r}: ({!r} == {!r}) {!r}".format(
                criteria, job.exit_status, value, res))
            return res
        elif what == 'max_age':
            value = _parse_time_value(value)
            if value is None or job.end_time is None:
                # Either the time period could not be parsed, or the job never finished
                self.logger.debug("Evaluation of malformed criteria {!r}, defaulting to False".format(criteria))
                return False
            now = int(time.time())
            res = (job.end_time > (now - value))
            self.logger.debug("Evaluate criteria {!r}: ({!r} > ({!r} - {!r}) {!r}".format(
                criteria, job.end_time, now, value, res))
            return res
        self.logger.debug("Evaluation of unknown criteria {!r}, defaulting to False".format(criteria))
        return False


def job_from_file(filename):
    """
    Recreate Job() instance from saved file.

    @param filename: Filename to load script invocation details from

    @type filename: string
    @rtype: Job
    """
    job = Job('')
    return job.from_file(filename)


def parse_args(defaults):
    """
    Parse the command line arguments

    @param defaults: Argument defaults

    @type defaults: dict
    """
    parser = argparse.ArgumentParser(description = 'Script herder script',
                                     add_help = True,
                                     formatter_class = argparse.ArgumentDefaultsHelpFormatter,
                                     )

    parser.add_argument('--debug',
                        dest = 'debug',
                        action = 'store_true', default = defaults['debug'],
                        help = 'Enable debug operation',
                        )
    parser.add_argument('--syslog',
                        dest = 'syslog',
                        action = 'store_true', default = defaults['syslog'],
                        help = 'Enable syslog output',
                        )
    parser.add_argument('--mode',
                        dest = 'mode',
                        choices = ['wrap', 'ls', 'check'], default = defaults['mode'],
                        help = 'What mode to run in',
                        )
    parser.add_argument('-d', '--datadir',
                        dest = 'datadir',
                        default = defaults['datadir'],
                        help = 'Data directory',
                        metavar = 'PATH',
                        )
    parser.add_argument('--checkdir',
                        dest = 'checkdir',
                        default = defaults['checkdir'],
                        help = 'Check definitions directory',
                        metavar = 'PATH',
                        )
    parser.add_argument('-N', '--name',
                        dest = 'name',
                        help = 'Job name',
                        metavar = 'NAME',
                        )

    parser.add_argument('cmd',
                        nargs = '*', default = [],
                        help = 'Script command',
                        metavar = 'CMD',
                        )

    args = parser.parse_args()
    return args


def mode_wrap(args, logger):
    """
    Execute a job and save result state in a file.

    The return value tells if the job was executed and recorded, not
    whether the job itself succeeded.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: bool
    """
    if not args.cmd:
        logger.error("No command to execute")
        return False
    job = Job(args.name, cmd=args.cmd)
    logger.debug("Invoking '{!s}'".format(' '.join(args.cmd)))
    job.run()
    logger.debug("Finished, exit status {!r}".format(job.exit_status))
    logger.debug("Job output:\n{!s}".format(job.output))
    job.save_to_file(args.datadir, logger)
    return True


def mode_ls(args, logger):
    """
    List all the saved states for jobs.

    One line per saved invocation is printed, oldest first.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: bool
    """
    jobs = _get_job_results(args, logger)
    for this in sorted(jobs, key=lambda x: x.start_time):
        start = time.strftime('%Y-%m-%d %X', time.localtime(this.start_time))
        print('{start}  {duration:>6}  exit={exit}  name={name}  {filename}'.format(
            start = start,
            duration = this.duration_str,
            exit = this.exit_status,
            name = this.name,
            filename = this.filename,
        ))
    return True


def mode_check(args, logger):
    """
    Evaluate the stored states for either a specific job, or all jobs.

    Return Nagios compatible output (scriptherder --mode check is intended to
    run using Nagios NRPE or similar).

    @param args: Parsed command line arguments
    @param logger: logging logger

    @raise ValueError: If the check definition for a job could not be loaded

    @return: One of the values in exit_status
    @rtype: int
    """
    jobs = _get_job_results(args, logger)

    # group the jobs by their name
    by_name = {}
    for this in jobs:
        by_name.setdefault(this.name, []).append(this)

    total_ok = []
    total_warning = []
    total_critical = []

    # determine total check status based on all logged invocations of this job
    for (name, name_jobs) in by_name.items():
        # Sort jobs, oldest first
        name_jobs = sorted(name_jobs, key=lambda x: x.start_time)
        # Load the evaluation criteria for this job
        check_filename = os.path.join(args.checkdir, name + '.ini')
        logger.debug("Loading check definition from {!r}".format(check_filename))
        check = Check(check_filename, logger)
        logger.debug("Checking {!r}: {!r}".format(name, name_jobs))

        jobs_ok = []
        jobs_warning = []
        jobs_critical = []
        for job in name_jobs:
            if check.job_is_ok(job):
                jobs_ok.append(job)
            elif check.job_is_warning(job):
                jobs_warning.append(job)
            else:
                jobs_critical.append(job)
        logger.debug("Raw status OK      : {!r}".format(jobs_ok))
        logger.debug("Raw status WARN    : {!r}".format(jobs_warning))
        logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
        # A single invocation in a better state is enough for the job as a
        # whole to get that state - the most recent such invocation is reported.
        if jobs_ok:
            total_ok.append(jobs_ok[-1])
        elif jobs_warning:
            total_warning.append(jobs_warning[-1])
        else:
            total_critical.append(jobs_critical[-1])

    if args.cmd:
        # Single job check requested, output detailed information
        if total_ok:
            print('OK: {!s}'.format(total_ok[-1]))
            return exit_status['OK']
        if total_warning:
            print('WARNING: {!s}'.format(total_warning[-1]))
            return exit_status['WARNING']
        if total_critical:
            print('CRITICAL: {!s}'.format(total_critical[-1]))
            return exit_status['CRITICAL']
        print("UNKNOWN - no jobs found for {!r}?".format(args.cmd))
        return exit_status['UNKNOWN']

    # When looking at multiple jobs at once, logic gets a bit reversed - if ANY
    # job invocation is CRITICAL/WARNING, the aggregate message given to
    # Nagios will have to be a failure.
    if total_critical:
        print("CRITICAL: {num} job(s) in this state: {names}".format(
            num = len(total_critical),
            names = ', '.join([str(x.name) for x in total_critical]),
        ))
        return exit_status['CRITICAL']
    if total_warning:
        print("WARNING: {num} job(s) in this state: {names}".format(
            num = len(total_warning),
            names = ', '.join([str(x.name) for x in total_warning]),
        ))
        return exit_status['WARNING']
    if total_ok:
        print("OK: {num} job(s) in this state: {names}".format(
            num = len(total_ok),
            names = ', '.join([x.name for x in total_ok]),
        ))
        return exit_status['OK']
    print("UNKNOWN - no jobs found?")
    return exit_status['UNKNOWN']


def _get_job_results(args, logger):
    """
    Load all jobs matching any specified name on the command line.

    Only files in the data directory with names ending in '.json' are loaded.
    If a command was given on the command line, its first word is the job
    name to match.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: [Job]
    """
    jobs = []
    for this in os.listdir(args.datadir):
        if not this.endswith('.json'):
            continue
        filename = os.path.join(args.datadir, this)
        if not os.path.isfile(filename):
            continue
        job = job_from_file(filename)
        if args.cmd and args.cmd[0] != job.name:
            logger.debug("Skipping '{!s}' not matching '{!s}' (file {!s})".format(job.name, args.cmd[0], filename))
            continue
        jobs.append(job)
    return jobs


def _parse_time_value(value):
    """
    Parse time period strings such as 1d. A lone number is considered number of seconds.

    Return parsed value as number of seconds, or None if the value could not
    be parsed. At most one unit (s, m, h or d) is accepted.

    @param value: Value to parse
    @type value: string
    @rtype: int or None
    """
    match = re.match(r'^(\d+)([hmsd]?)$', value)
    if not match:
        return None
    seconds_per_unit = {'': 1,
                        's': 1,
                        'm': 60,
                        'h': 3600,
                        'd': 86400,
                        }
    return int(match.group(1)) * seconds_per_unit[match.group(2)]


def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults):
    """
    Main entry point for either wrapping a script, or checking the status of it.

    @param myname: String, used for logging
    @param args: Command line arguments
    @param logger: logging logger
    @param defaults: Default command line arguments

    @type myname: string
    @type args: None or parsed arguments, as returned by parse_args()
    @type logger: logging.logger
    @type defaults: dict

    @return: True/False for modes wrap and ls, a Nagios exit status for mode check
    @rtype: bool or int
    """
    if not args:
        args = parse_args(defaults)

    # initialize various components
    if not logger:
        logger = logging.getLogger(myname)
        if args.debug:
            logger.setLevel(logging.DEBUG)
            # log to stderr when debugging
            formatter = logging.Formatter('%(asctime)s %(name)s %(threadName)s: %(levelname)s %(message)s')
            stream_h = logging.StreamHandler(sys.stderr)
            stream_h.setFormatter(formatter)
            logger.addHandler(stream_h)
    if args.syslog:
        syslog_h = logging.handlers.SysLogHandler()
        formatter = logging.Formatter('%(name)s: %(levelname)s %(message)s')
        syslog_h.setFormatter(formatter)
        logger.addHandler(syslog_h)

    if args.mode == 'wrap':
        return mode_wrap(args, logger)
    elif args.mode == 'ls':
        return mode_ls(args, logger)
    elif args.mode == 'check':
        return mode_check(args, logger)
    else:
        logger.error("Invalid mode {!r}".format(args.mode))
        return False

if __name__ == '__main__':
    try:
        progname = os.path.basename(sys.argv[0])
        res = main(progname)
        # bool is a subclass of int, so it has to be tested for first. Otherwise
        # True (success) is passed on to sys.exit() and becomes exit status 1,
        # and False (failure) becomes exit status 0.
        if isinstance(res, bool):
            sys.exit(0 if res else 1)
        if isinstance(res, int):
            sys.exit(res)
        if res:
            sys.exit(0)
        sys.exit(1)
    except KeyboardInterrupt:
        sys.exit(0)