summaryrefslogtreecommitdiff
path: root/global/overlay/usr
diff options
context:
space:
mode:
authorFredrik Thulin <fredrik@thulin.net>2014-05-05 12:53:51 +0200
committerFredrik Thulin <fredrik@thulin.net>2014-05-05 12:53:51 +0200
commitd000b1adea71cc89f31d4509c135768ecc796ba3 (patch)
treeca607a5066b5696ef6a13bb3b13e64f1b38e04f3 /global/overlay/usr
parent0e9331718dba878570505faa0e28fa57f1e6f762 (diff)
Diffstat (limited to 'global/overlay/usr')
-rwxr-xr-xglobal/overlay/usr/local/bin/run-cosmos20
-rwxr-xr-xglobal/overlay/usr/local/bin/scriptherder722
2 files changed, 742 insertions, 0 deletions
diff --git a/global/overlay/usr/local/bin/run-cosmos b/global/overlay/usr/local/bin/run-cosmos
new file mode 100755
index 0000000..df104b8
--- /dev/null
+++ b/global/overlay/usr/local/bin/run-cosmos
@@ -0,0 +1,20 @@
#!/bin/sh
#
# Simplify running cosmos, with serialization if flock is available.
#
# All command line arguments are passed on unchanged to both `cosmos update'
# and `cosmos apply'. Because of `set -e' the stamp file at the end is only
# touched when every preceding command succeeded.
#

set -e

# `command -v' exits non-zero when flock is not installed. Without `|| true'
# that failure would make `set -e' abort the script right here, and the
# unserialized fallback below could never run.
FLOCK=$(command -v flock || true)

if [ -x "$FLOCK" ]; then
    (
        # Wait up to 60 seconds for an exclusive lock on file descriptor 9,
        # which the redirection below opens on the lock file.
        "$FLOCK" --exclusive --wait 60 9 || exit 1
        # "$@" (rather than $*) keeps arguments containing whitespace intact.
        cosmos "$@" update
        cosmos "$@" apply
    ) 9>/var/lock/run-cosmos
else
    cosmos "$@" update
    cosmos "$@" apply
fi

touch /var/run/last-cosmos-ok.stamp
diff --git a/global/overlay/usr/local/bin/scriptherder b/global/overlay/usr/local/bin/scriptherder
new file mode 100755
index 0000000..c11383a
--- /dev/null
+++ b/global/overlay/usr/local/bin/scriptherder
@@ -0,0 +1,722 @@
+#!/usr/bin/env python
+#
+# Copyright 2014 SUNET. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without modification, are
+# permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this list of
+# conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice, this list
+# of conditions and the following disclaimer in the documentation and/or other materials
+# provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY SUNET ``AS IS'' AND ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SUNET OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+# The views and conclusions contained in the software and documentation are those of the
+# authors and should not be interpreted as representing official policies, either expressed
+# or implied, of SUNET.
+#
+# Author : Fredrik Thulin <fredrik@thulin.net>
+#
+
+"""
+Scriptherder can be run in one of the following modes:
+
+ wrap -- Stores output, exit status etc. about a script invocation
+ ls -- Lists the logged script invocations
+ check -- Check if script execution results match given criteria,
+ output Nagios compatible result
+
+"""
+
+import os
+import re
+import sys
+import time
+import json
+import logging
+import logging.handlers
+import argparse
+import subprocess
+import ConfigParser
+
# Default values for the command line options built by parse_args().
_defaults = dict(
    debug=False,
    syslog=False,
    mode='ls',
    datadir='/var/cache/scriptherder',
    checkdir='/etc/scriptherder/check',
)

# Criteria handed to ConfigParser as defaults when a check definition file
# does not specify 'ok' and/or 'warning' itself (see Check.__init__()).
_check_defaults = dict(
    ok='exit_status=0,max_age=8h',
    warning='exit_status=0,max_age=24h',
)

# Process exit codes returned by mode_check() for each check state.
exit_status = dict(OK=0, WARNING=1, CRITICAL=2, UNKNOWN=3)
+
+
class Job(object):
    """
    Representation of an execution of a job.

    A Job is either created from a command line and executed with `run()',
    or re-created from a record previously written by `save_to_file()' using
    `from_file()'.
    """

    def __init__(self, name, cmd=None):
        """
        @param name: Name of the job, or None to use the basename of the command
        @param cmd: Command to execute, argv style

        @type name: string or None
        @type cmd: [string] or None

        @raise TypeError: If an element of cmd is not a string
        """
        if cmd is None:
            cmd = []
        self._validate_cmd(cmd)
        self._name = name
        self._cmd = cmd
        self._start_time = None
        self._end_time = None
        self._exit_status = None
        self._pid = None
        self._output = None
        self._filename = None
        self._output_filename = None
        # Only derive a name when there is a command to derive it from; an
        # empty command previously made the constructor fail with IndexError.
        if self._name is None and self._cmd:
            self._name = os.path.basename(self.cmd)

    @staticmethod
    def _validate_cmd(cmd):
        """
        Make sure every element of a command is a string.

        An explicit exception is used rather than `assert', since assertions
        are removed when Python runs with -O.

        @param cmd: Command to validate
        @type cmd: [string]

        @raise TypeError: If an element of cmd is not a string
        """
        try:
            string_types = basestring  # Python 2: matches both str and unicode
        except NameError:
            string_types = str
        for x in cmd:
            if not isinstance(x, string_types):
                raise TypeError('Command element {!r} is not a string'.format(x))

    @staticmethod
    def _write_file(tmp_filename, final_filename, content):
        """
        Write content to tmp_filename and then rename it to final_filename, so
        that the final name never refers to a partially written file.

        @type tmp_filename: string
        @type final_filename: string
        @type content: string
        """
        with open(tmp_filename, 'w') as f:
            f.write(content)
        os.rename(tmp_filename, final_filename)

    def _start_time_str(self):
        """
        The start time formatted for display, or 'None' if the job has no
        start time (time.localtime(None) would silently show the current time).

        @rtype: string
        """
        if self._start_time is None:
            return 'None'
        return time.strftime('%Y-%m-%d %X', time.localtime(self.start_time))

    def __repr__(self):
        return '<{} instance at {:#x}: \'{name}\' start={start}, exit={exit}>'.format(
            self.__class__.__name__,
            id(self),
            name=self.name,
            start=self._start_time_str(),
            exit=self.exit_status,
        )

    def __str__(self):
        return '\'{name}\' start={start}, duration={duration:>6}, exit={exit}'.format(
            name=self.name,
            start=self._start_time_str(),
            duration=self.duration_str,
            exit=self.exit_status,
        )

    @property
    def name(self):
        """
        The name of the job. Falls back to the command if no name is set.

        @rtype: string or None
        """
        if self._name is None:
            return self.cmd
        return self._name

    @property
    def cmd(self):
        """
        The wrapped scripts name (first element of the command), or None if
        the job has no command.

        @rtype: string or None
        """
        if not self._cmd:
            return None
        return self._cmd[0]

    @property
    def args(self):
        """
        The wrapped scripts arguments (everything after the first element).

        @rtype: [string]
        """
        return self._cmd[1:]

    @property
    def start_time(self):
        """
        The start time of the script invocation, truncated to whole seconds.

        @rtype: int() or None
        """
        if self._start_time is None:
            return None
        return int(self._start_time)

    @property
    def end_time(self):
        """
        The end time of the script invocation, truncated to whole seconds.

        @rtype: int() or None
        """
        if self._end_time is None:
            return None
        return int(self._end_time)

    @property
    def duration_str(self):
        """
        Time spent executing job, as a human readable string.

        Durations below one second are given in milliseconds. Returns 'n/a'
        if the start or end time is not known.

        @rtype: string
        """
        if self._start_time is None or self._end_time is None:
            return 'n/a'
        duration = self._end_time - self._start_time
        if duration < 1:
            # milliseconds
            return '{:0.3f}ms'.format(duration * 1000)
        return '{:0.3f}s'.format(duration)

    @property
    def exit_status(self):
        """
        The exit status of the script invocation.

        @rtype: int() or None
        """
        return self._exit_status

    @property
    def pid(self):
        """
        The process ID of the script invocation.

        @rtype: int() or None
        """
        return self._pid

    @property
    def filename(self):
        """
        The filename this job is stored in.

        @rtype: string or None
        """
        return self._filename

    @property
    def output(self):
        """
        The output (STDOUT and STDERR) of the script invocation.

        If no output is held in memory and an output file is known, the
        output is read from that file on first access.

        @rtype: string or None
        """
        if not self._output and self.output_filename:
            with open(self.output_filename, 'r') as f:
                self._output = f.read()
        return self._output

    @property
    def output_filename(self):
        """
        The name of the file holding the output (STDOUT and STDERR) of the script invocation.

        @rtype: string or None
        """
        return self._output_filename

    def run(self):
        """
        Run script, storing various aspects of the results.

        The command is executed with '/' as working directory and STDERR
        merged into STDOUT. This call blocks until the command has finished.
        """
        self._start_time = time.time()
        proc = subprocess.Popen(self._cmd,
                                cwd='/',
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                close_fds=True,
                                )
        # stderr is redirected into stdout above, so the second element is unused
        (stdout, _stderr) = proc.communicate()
        self._end_time = time.time()
        self._output = stdout
        self._exit_status = proc.returncode
        self._pid = proc.pid

    def save_to_file(self, datadir, logger, filename=None):
        """
        Create a record with the details of a script invocation.

        Metadata is stored as JSON in '<filename>.json'. Output, if there is
        any, is stored in '<filename>_output.data'. Both files are written
        under a temporary name and then renamed into place.

        @param datadir: Directory to keep records in
        @param logger: logging logger
        @param filename: Filename to use - default is reasonably constructed

        @type datadir: string
        @type logger: logging.logger
        @type filename: string or None
        """
        if filename is None:
            # Job name with everything but letters and digits replaced by '_'
            safe_name = ''.join(x if x.isalnum() else '_' for x in self.name)
            filename = '{!s}_{!s}_{!s}'.format(safe_name, self.start_time, self.pid)
        fn = os.path.join(datadir, filename)
        output_fn = fn + '_output'
        data = {'name': self.name,
                'cmd': self._cmd,
                'start_time': self._start_time,
                'end_time': self._end_time,
                'pid': self.pid,
                'exit_status': self.exit_status,
                'version': 2,
                }

        if self._output:
            data['output_filename'] = output_fn + '.data'
            data['output_size'] = len(self._output)
            # Write the output before the metadata referring to it, so a
            # reader that finds the .json file also finds the output file.
            logger.debug("Saving job output to file {!r}".format(output_fn + '.data'))
            self._write_file(output_fn + '.tmp', output_fn + '.data', self._output)
            # Record the name actually written (the same one stored in the metadata)
            self._output_filename = output_fn + '.data'

        logger.debug("Saving job metadata to file {!r}.tmp".format(fn))
        self._write_file(fn + '.tmp', fn + '.json',
                         json.dumps(data, indent=4, sort_keys=True) + '\n')
        # Full name of the metadata file, consistent with what from_file() records
        self._filename = fn + '.json'

    def from_file(self, filename):
        """
        Initialize this Job instance with data loaded from a file (previously created with
        `save_to_file()').

        Version 1 records hold the output inline, version 2 records refer to
        a separate output file that is read on demand by the `output' property.

        @param filename: Filename to load data from
        @type filename: string

        @rtype: Job

        @raise AssertionError: If the file has an unknown version
        @raise TypeError: If the stored command is not a list of strings
        """
        with open(filename, 'r') as f:
            # Read at most 100 MB
            data = json.loads(f.read(100 * 1024 * 1024))
        version = data.get('version')
        if version not in (1, 2):
            raise AssertionError('Unknown version in file {!r}: {!r}'.format(filename, version))
        self._validate_cmd(data['cmd'])
        self._name = data.get('name')
        self._cmd = data['cmd']
        self._start_time = data['start_time']
        self._end_time = data['end_time']
        self._pid = data['pid']
        self._exit_status = data['exit_status']
        if version == 1:
            self._output = data['output']
            self._output_filename = None
        else:
            # Drop any output held from before, it is loaded lazily from the file
            self._output = None
            self._output_filename = data.get('output_filename')
            # data.get('output_size') is currently not used in scriptherder
        self._filename = filename
        return self
+
+
class Check(object):
    """
    Conditions for the 'check' command. Loaded from file (one file per job name),
    and used to check if a Job instance is OK or WARNING or ...
    """

    def __init__(self, filename, logger):
        """
        Load check criteria from a file.

        Example file contents:

          [check]
          ok = exit_status=0, max_age=8h
          warning = exit_status=0, max_age=24h

        @param filename: INI file with check criteria for a specific job
        @param logger: logging logger

        @type filename: string
        @type logger: logging.logger

        @raise ValueError: If the file could not be loaded
        """
        self.logger = logger
        self.config = ConfigParser.ConfigParser(_check_defaults)
        if not self.config.read([filename]):
            raise ValueError("Failed loading config file {!r}".format(filename))
        _section = 'check'
        self._ok_criteria = [x.strip() for x in self.config.get(_section, 'ok').split(',')]
        self._warning_criteria = [x.strip() for x in self.config.get(_section, 'warning').split(',')]

    def job_is_ok(self, job):
        """
        Evaluate a Job against the OK criteria for this check.

        @type job: Job

        @rtype: bool
        """
        return self._job_meets(job, self._ok_criteria, 'OK')

    def job_is_warning(self, job):
        """
        Evaluate a Job against the WARNING criteria for this check.

        @type job: Job

        @rtype: bool
        """
        return self._job_meets(job, self._warning_criteria, 'WARNING')

    def _job_meets(self, job, criteria_list, label):
        """
        Check whether a job fulfils every criteria in a list.

        All criteria are evaluated even after one has failed, so that every
        failure shows up in the debug log.

        @param job: The job
        @param criteria_list: The criteria to test
        @param label: Name of the state being tested, used in log messages

        @type job: Job
        @type criteria_list: [string]
        @type label: string

        @rtype: bool
        """
        res = True
        for this in criteria_list:
            if not self._evaluate(this, job):
                self.logger.debug("Job {!r} failed {!s} criteria {!r}".format(job, label, this))
                res = False
        self.logger.debug("{!r} is {!s} result: {!r}".format(job, label, res))
        return res

    def _evaluate(self, criteria, job):
        """
        The actual evaluation engine.

        Supported criteria are 'exit_status=N' (the exit status of the job
        equals N) and 'max_age=T' (the job ended less than T ago). Anything
        that is unknown or can't be parsed evaluates to False.

        @param criteria: The criteria to test ('max_age=8h' for example)
        @param job: The job

        @type criteria: string
        @type job: Job

        @rtype: bool
        """
        # partition() copes with criteria having no '=' at all, or more than one
        (what, _sep, value) = criteria.partition('=')
        # str.strip() returns a new string, so the result has to be kept
        what = what.strip()
        value = value.strip()
        if what == 'exit_status':
            try:
                value = int(value)
            except ValueError:
                self.logger.debug("Invalid value in criteria {!r}, defaulting to False".format(criteria))
                return False
            res = (job.exit_status == value)
            self.logger.debug("Evaluate criteria {!r}: ({!r} == {!r}) {!r}".format(
                criteria, job.exit_status, value, res))
            return res
        elif what == 'max_age':
            value = _parse_time_value(value)
            if value is None:
                # _parse_time_value() gives None for values it can't parse
                self.logger.debug("Invalid value in criteria {!r}, defaulting to False".format(criteria))
                return False
            if job.end_time is None:
                self.logger.debug("Job {!r} has no end time, criteria {!r} defaulting to False".format(
                    job, criteria))
                return False
            now = int(time.time())
            res = (job.end_time > (now - value))
            self.logger.debug("Evaluate criteria {!r}: ({!r} > ({!r} - {!r}) {!r}".format(
                criteria, job.end_time, now, value, res))
            return res
        self.logger.debug("Evaluation of unknown criteria {!r}, defaulting to False".format(criteria))
        return False
+
+
def job_from_file(filename):
    """
    Build a Job() from a record stored on disk.

    An empty, unnamed Job is created and then populated by its
    from_file() method.

    @param filename: File holding the details of a script invocation

    @type filename: string
    @rtype: Job
    """
    return Job('').from_file(filename)
+
+
def parse_args(defaults):
    """
    Build the command line parser and parse sys.argv with it.

    @param defaults: Default values for --debug, --syslog, --mode,
                     --datadir and --checkdir (looked up by those names)

    @type defaults: dict
    @rtype: argparse.Namespace
    """
    parser = argparse.ArgumentParser(
        description='Script herder script',
        add_help=True,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # (option strings / positional name, add_argument() keyword arguments),
    # registered in this order.
    arguments = [
        (('--debug',),
         dict(dest='debug', action='store_true', default=defaults['debug'],
              help='Enable debug operation')),
        (('--syslog',),
         dict(dest='syslog', action='store_true', default=defaults['syslog'],
              help='Enable syslog output')),
        (('--mode',),
         dict(dest='mode', choices=['wrap', 'ls', 'check'], default=defaults['mode'],
              help='What mode to run in')),
        (('-d', '--datadir'),
         dict(dest='datadir', default=defaults['datadir'],
              help='Data directory', metavar='PATH')),
        (('--checkdir',),
         dict(dest='checkdir', default=defaults['checkdir'],
              help='Check definitions directory', metavar='PATH')),
        (('-N', '--name'),
         dict(dest='name', help='Job name', metavar='NAME')),
        (('cmd',),
         dict(nargs='*', default=[], help='Script command', metavar='CMD')),
    ]
    for names, options in arguments:
        parser.add_argument(*names, **options)

    return parser.parse_args()
+
+
def mode_wrap(args, logger):
    """
    Execute a job and save result state in a file.

    The return value says whether the job was executed and recorded, it does
    not reflect the exit status of the job itself.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: bool
    @return: True when the job was run and saved, False if no command was given
    """
    if not args.cmd:
        # Without this, an empty command fails with an exception further down
        logger.error("No command to wrap specified")
        return False
    job = Job(args.name, cmd=args.cmd)
    # Join with a space, otherwise command and arguments run together in the log
    logger.debug("Invoking '{!s}'".format(' '.join(args.cmd)))
    job.run()
    logger.debug("Finished, exit status {!r}".format(job.exit_status))
    logger.debug("Job output:\n{!s}".format(job.output))
    job.save_to_file(args.datadir, logger)
    return True
+
+
def mode_ls(args, logger):
    """
    Print one line per saved job state, ordered by start time.

    Each line shows start time, duration, exit status, job name and the
    file the state was loaded from.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: bool
    @return: Always True
    """
    line_format = '{start} {duration:>6} exit={exit} name={name} {filename}'
    ordered = sorted(_get_job_results(args, logger), key=lambda job: job.start_time)
    for entry in ordered:
        started = time.strftime('%Y-%m-%d %X', time.localtime(entry.start_time))
        line = line_format.format(
            start=started,
            duration=entry.duration_str,
            exit=entry.exit_status,
            name=entry.name,
            filename=entry.filename,
        )
        print(line)
    return True
+
+
def mode_check(args, logger):
    """
    Evaluate the stored states for either a specific job, or all jobs.

    Return Nagios compatible output (scriptherder --mode check is intended to
    run using Nagios NRPE or similar).

    Every job name is classified using the check definition in
    '<checkdir>/<name>.ini': it is OK if any logged invocation meets the OK
    criteria, otherwise WARNING if any invocation meets the WARNING criteria,
    otherwise CRITICAL. One status line is printed.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: int
    @return: One of the values in exit_status
    """
    all_jobs = _get_job_results(args, logger)

    # group the jobs by their name
    by_name = {}
    for this in all_jobs:
        by_name.setdefault(this.name, []).append(this)

    total_ok = []
    total_warning = []
    total_critical = []

    # determine total check status based on all logged invocations of this job
    for (name, name_jobs) in by_name.items():
        # Sort jobs, oldest first
        name_jobs = sorted(name_jobs, key=lambda x: x.start_time)
        # Load the evaluation criteria for this job
        check_filename = os.path.join(args.checkdir, name + '.ini')
        logger.debug("Loading check definition from {!r}".format(check_filename))
        check = Check(check_filename, logger)
        logger.debug("Checking {!r}: {!r}".format(name, name_jobs))

        jobs_ok = []
        jobs_warning = []
        jobs_critical = []
        for job in name_jobs:
            if check.job_is_ok(job):
                jobs_ok.append(job)
            elif check.job_is_warning(job):
                jobs_warning.append(job)
            else:
                jobs_critical.append(job)
        logger.debug("Raw status OK      : {!r}".format(jobs_ok))
        logger.debug("Raw status WARN    : {!r}".format(jobs_warning))
        logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))
        # Represent the job by the most recent invocation in its best state.
        # name_jobs is never empty, so one of the three lists has an element.
        if jobs_ok:
            total_ok.append(jobs_ok[-1])
        elif jobs_warning:
            total_warning.append(jobs_warning[-1])
        else:
            total_critical.append(jobs_critical[-1])

    if args.cmd:
        # Single job check requested, output detailed information
        if total_ok:
            print('OK: {!s}'.format(total_ok[-1]))
            return exit_status['OK']
        if total_warning:
            print('WARNING: {!s}'.format(total_warning[-1]))
            return exit_status['WARNING']
        if total_critical:
            print('CRITICAL: {!s}'.format(total_critical[-1]))
            return exit_status['CRITICAL']
        # print() with a single argument works as both statement and function
        print("UNKNOWN - no jobs found for {!r}?".format(args.cmd))
        return exit_status['UNKNOWN']

    # When looking at multiple jobs at once, logic gets a bit reversed - if ANY
    # job is CRITICAL/WARNING, the aggregate message given to
    # Nagios will have to be a failure.
    if total_critical:
        print("CRITICAL: {num} job(s) in this state: {names}".format(
            num=len(total_critical),
            names=', '.join([str(x.name) for x in total_critical]),
        ))
        return exit_status['CRITICAL']
    if total_warning:
        print("WARNING: {num} job(s) in this state: {names}".format(
            num=len(total_warning),
            names=', '.join([str(x.name) for x in total_warning]),
        ))
        return exit_status['WARNING']
    if total_ok:
        print("OK: {num} job(s) in this state: {names}".format(
            num=len(total_ok),
            names=', '.join([x.name for x in total_ok]),
        ))
        return exit_status['OK']
    print("UNKNOWN - no jobs found?")
    return exit_status['UNKNOWN']
+
+
def _get_job_results(args, logger):
    """
    Load the jobs stored as '*.json' files in the data directory.

    If a command was given on the command line, only jobs whose name equals
    its first element are returned.

    @param args: Parsed command line arguments
    @param logger: logging logger

    @rtype: [Job]
    """
    candidates = []
    for entry in os.listdir(args.datadir):
        if os.path.isfile(os.path.join(args.datadir, entry)):
            candidates.append(entry)

    results = []
    for entry in candidates:
        if entry.endswith('.json'):
            filename = os.path.join(args.datadir, entry)
            job = job_from_file(filename)
            if args.cmd and args.cmd[0] != job.name:
                logger.debug("Skipping '{!s}' not matching '{!s}' (file {!s})".format(job.name, args.cmd[0], filename))
            else:
                results.append(job)
    return results
+
+
def _parse_time_value(value):
    """
    Parse time period strings such as 1d. A lone number is considered number of seconds.

    Supported units are s (seconds), m (minutes), h (hours) and d (days). At
    most one unit is accepted - values such as '8hh' or '5ms' used to be
    silently treated as a number of seconds and are now rejected.
    Leading and trailing whitespace is ignored.

    Return parsed value as number of seconds, or None if value can't be parsed.

    @param value: Value to parse
    @type value: string
    @rtype: int or None
    """
    multipliers = {'': 1, 's': 1, 'm': 60, 'h': 3600, 'd': 86400}
    match = re.match(r'^(\d+)([hmsd]?)$', value.strip())
    if not match:
        return None
    return int(match.group(1)) * multipliers[match.group(2)]
+
+
+def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults):
+ """
+ Main entry point for either wrapping a script, or checking the status of it.
+
+ Parses the command line unless args is given, sets up logging, and then
+ hands over to mode_wrap(), mode_ls() or mode_check() depending on args.mode.
+
+ @param myname: String, used for logging
+ @param args: Command line arguments
+ @param logger: logging logger
+ @param defaults: Default command line arguments
+
+ @type myname: string
+ @type args: None or [string]
+ @type logger: logging.logger
+ @type defaults: dict
+
+ @return: The return value of the selected mode function, or False if
+ args.mode is not a known mode
+ """
+ # NOTE(review): args is used below as a parsed argparse result (args.debug,
+ # args.syslog, args.mode), although @type above says [string] - confirm
+ if not args:
+ args = parse_args(defaults)
+
+ # initialize various components
+ # NOTE(review): indentation was lost in this copy of the file, so it can't
+ # be told from here whether the handler setup below is nested under
+ # `if not logger:' (i.e. skipped for a caller supplied logger) - confirm
+ if not logger:
+ logger = logging.getLogger(myname)
+ if args.debug:
+ logger.setLevel(logging.DEBUG)
+ # log to stderr when debugging
+ formatter = logging.Formatter('%(asctime)s %(name)s %(threadName)s: %(levelname)s %(message)s')
+ stream_h = logging.StreamHandler(sys.stderr)
+ stream_h.setFormatter(formatter)
+ logger.addHandler(stream_h)
+ if args.syslog:
+ # SysLogHandler() is created without arguments, i.e. with its default destination
+ syslog_h = logging.handlers.SysLogHandler()
+ formatter = logging.Formatter('%(name)s: %(levelname)s %(message)s')
+ syslog_h.setFormatter(formatter)
+ logger.addHandler(syslog_h)
+
+ # dispatch on the requested mode
+ if args.mode == 'wrap':
+ return mode_wrap(args, logger)
+ elif args.mode == 'ls':
+ return mode_ls(args, logger)
+ elif args.mode == 'check':
+ return mode_check(args, logger)
+ else:
+ logger.error("Invalid mode {!r}".format(args.mode))
+ return False
+
if __name__ == '__main__':
    # Turn the result of main() into a process exit status:
    #   True / other true value -> 0, False / other false value -> 1,
    #   integer                 -> used as exit status as is
    try:
        progname = os.path.basename(sys.argv[0])
        res = main(progname)
        # bool is a subclass of int, so it has to be tested for first.
        # Otherwise True is passed to sys.exit() as the integer 1 (failure)
        # and False as 0 (success), the opposite of what is intended.
        if isinstance(res, bool):
            sys.exit(0 if res else 1)
        if isinstance(res, int):
            sys.exit(res)
        if res:
            sys.exit(0)
        sys.exit(1)
    except KeyboardInterrupt:
        sys.exit(0)