diff options
-rw-r--r-- | global/overlay/etc/cron.d/cosmos | 2 | ||||
-rwxr-xr-x | global/overlay/etc/cron.daily/scriptherder_cleanup | 10 | ||||
-rw-r--r-- | global/overlay/etc/puppet/facter/cosmos.rb | 22 | ||||
-rw-r--r-- | global/overlay/etc/puppet/puppet.conf | 4 | ||||
-rw-r--r-- | global/overlay/etc/scriptherder/check/cosmos.ini | 3 | ||||
-rwxr-xr-x | global/overlay/usr/local/bin/scriptherder | 302 | ||||
-rwxr-xr-x | global/post-tasks.d/015cosmos-trust | 7 |
7 files changed, 276 insertions, 74 deletions
diff --git a/global/overlay/etc/cron.d/cosmos b/global/overlay/etc/cron.d/cosmos index 70af3a4..58b45af 100644 --- a/global/overlay/etc/cron.d/cosmos +++ b/global/overlay/etc/cron.d/cosmos @@ -1,4 +1,4 @@ SHELL=/bin/sh PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin -*/15 * * * * root test -f /etc/no-automatic-cosmos || (cosmos update ; cosmos apply) +*/15 * * * * root test -f /etc/no-automatic-cosmos || scriptherder --mode wrap --syslog --name cosmos -- /usr/local/bin/run-cosmos -v diff --git a/global/overlay/etc/cron.daily/scriptherder_cleanup b/global/overlay/etc/cron.daily/scriptherder_cleanup new file mode 100755 index 0000000..08ec7f5 --- /dev/null +++ b/global/overlay/etc/cron.daily/scriptherder_cleanup @@ -0,0 +1,10 @@ +#!/bin/sh +# +# Remove scriptherder data older than 7 days. +# + +DIR="/var/cache/scriptherder/" + +test -d ${DIR} || exit 0 + +find ${DIR} -type f -mtime +7 -print0 | xargs -0 rm -f diff --git a/global/overlay/etc/puppet/facter/cosmos.rb b/global/overlay/etc/puppet/facter/cosmos.rb new file mode 100644 index 0000000..d810082 --- /dev/null +++ b/global/overlay/etc/puppet/facter/cosmos.rb @@ -0,0 +1,22 @@ +# +# Extract local Cosmos configuration +# +require 'facter' +Facter.add(:cosmos_repo) do + setcode do + Facter::Util::Resolution.exec("sh -c '. /etc/cosmos/cosmos.conf && echo $COSMOS_REPO'") + end +end + +Facter.add(:cosmos_tag_pattern) do + setcode do + Facter::Util::Resolution.exec("sh -c '. /etc/cosmos/cosmos.conf && echo $COSMOS_UPDATE_VERIFY_GIT_TAG_PATTERN'") + end +end + +Facter.add(:cosmos_repo_origin_url) do + setcode do + Facter::Util::Resolution.exec("sh -c '. 
class ScriptHerderError(Exception):
    """
    Base exception class for scriptherder.

    Attributes:

      reason: Short human readable description of what went wrong.
      filename: Name of the file being processed when the error occurred.
    """

    def __init__(self, reason, filename):
        """
        @param reason: Description of the failure
        @param filename: File that could not be processed

        @type reason: string
        @type filename: string
        """
        # Pass both values on to Exception. Without this, exc.args (and with
        # it str() and repr()) only contains whatever happened to be passed
        # positionally, so the filename disappears from log messages when the
        # caller uses filename=... as a keyword argument.
        super(ScriptHerderError, self).__init__(reason, filename)
        self.reason = reason
        self.filename = filename

    def __str__(self):
        return '{!s} (file {!r})'.format(self.reason, self.filename)


class JobLoadError(ScriptHerderError):
    """
    Raised when loading a job file fails.
    """


class CheckLoadError(ScriptHerderError):
    """
    Raised when loading a check file fails.
    """
class CheckStatus(object):
    """
    Aggregated status of job invocations for --mode check.

    Attributes:

      checks_ok: List of checks in OK state ([Job()]).
      checks_warning: List of checks in WARNING state ([Job()]).
      checks_critical: List of checks in CRITICAL state ([Job()]).
    """

    def __init__(self, args, logger):
        """
        @param args: Parsed command line arguments
        @param logger: logging logger

        @raise CheckLoadError: if a check definition file can't be loaded
        """
        self.checks_ok = []
        self.checks_warning = []
        self.checks_critical = []

        self._jobs = _get_job_results(args, logger)
        # group the jobs by their name
        self._jobs_by_name = {}
        for this in self._jobs:
            self._jobs_by_name.setdefault(this.name, []).append(this)

        self._job_count = len(self._jobs_by_name)

        self._check_running_jobs(args, logger)
        if not args.cmd:
            # No specific job was asked for, so also report checks that are
            # configured but have never logged an execution.
            self._check_not_running(args, logger)

    def _check_running_jobs(self, args, logger):
        """
        Look for job execution entries (parsed into Job() instances), group them
        per check name and determine the status. For each group, append status
        to one of the three aggregate status lists of this object (checks_ok,
        checks_warning or checks_critical).

        @param args: Parsed command line arguments
        @param logger: logging logger

        @raise CheckLoadError: if the check definition for a job can't be loaded
        """
        # determine total check status based on all logged invocations of this job
        for (name, jobs) in self._jobs_by_name.items():
            # Load the evaluation criteria for this job
            check_filename = os.path.join(args.checkdir, name + '.ini')
            logger.debug("Loading check definition from {!r}".format(check_filename))
            try:
                check = Check(check_filename, logger)
            except ScriptHerderError as exc:
                logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
                raise CheckLoadError('Failed loading check', filename=check_filename)

            # Sort jobs, oldest first
            jobs = sorted(jobs, key=lambda x: x.start_time)
            logger.debug("Checking {!r}: {!r}".format(name, jobs))

            jobs_ok = []
            jobs_warning = []
            jobs_critical = []
            for job in jobs:
                if check.job_is_ok(job):
                    jobs_ok.append(job)
                elif check.job_is_warning(job):
                    jobs_warning.append(job)
                else:
                    jobs_critical.append(job)

            logger.debug("Raw status OK      : {!r}".format(jobs_ok))
            logger.debug("Raw status WARN    : {!r}".format(jobs_warning))
            logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical))

            # add most recent job status to the totals
            if jobs_ok:
                self.checks_ok.append(jobs_ok[-1])
            elif jobs_warning:
                self.checks_warning.append(jobs_warning[-1])
            else:
                self.checks_critical.append(jobs_critical[-1])

    def _check_not_running(self, args, logger):
        """
        Look for check definitions ('*.ini' files in args.checkdir) for which
        no job execution has been logged at all. Every such check is counted
        as a job and added to checks_critical as a placeholder Job() carrying
        only the check name.

        @param args: Parsed command line arguments
        @param logger: logging logger

        @raise CheckLoadError: if a check definition file can't be loaded
        """
        # sorted() only makes the processing order deterministic
        for this in sorted(os.listdir(args.checkdir)):
            if not this.endswith('.ini'):
                continue
            filename = os.path.join(args.checkdir, this)
            if not os.path.isfile(filename):
                continue
            logger.debug("Loading check definition from {!r}".format(filename))
            try:
                # validate check loads
                Check(filename, logger)
            except ScriptHerderError as exc:
                # Check() signals load failures with ScriptHerderError, not ValueError
                logger.warning("Failed loading check: {!r}".format(exc), exc_info=True)
                raise CheckLoadError('Failed loading check', filename=filename)
            name = this[:-len('.ini')]
            if name not in self._jobs_by_name:
                logger.debug('Check {!r} (filename {!r}) not found in jobs'.format(name, filename))
                job = Job(name=name)
                self.checks_critical.append(job)
                self._job_count += 1
            else:
                logger.debug('Check {!r} has {!r} logged results'.format(name, len(self._jobs_by_name[name])))

    def num_jobs(self):
        """
        Return number of jobs processed. This is the number of different jobs
        with logged executions, plus the number of configured checks without
        any logged execution (only counted when no specific job was requested).

        @rtype: int
        """
        return self._job_count
@@ -488,6 +648,7 @@ def parse_args(defaults): ) args = parser.parse_args() + return args @@ -537,88 +698,61 @@ def mode_check(args, logger): @param args: Parsed command line arguments @param logger: logging logger """ - jobs = _get_job_results(args, logger) - # group the jobs by their name - by_name = {} - for this in jobs: - if this.name not in by_name: - by_name[this.name] = [] - by_name[this.name].append(this) - - total_ok = [] - total_warning = [] - total_critical = [] - - # determine total check status based on all logged invocations of this job - for (name, jobs) in by_name.items(): - # Sort jobs, oldest first - jobs = sorted(jobs, key=lambda x: x.start_time) - # Load the evaluation criterias for this job - check_filename = os.path.join(args.checkdir, name + '.ini') - logger.debug("Loading check definition from {!r}".format(check_filename)) - check = Check(check_filename, logger) - logger.debug("Checking {!r}: {!r}".format(name, jobs)) - - jobs_ok = [] - jobs_warning = [] - jobs_critical = [] - for job in jobs: - if check.job_is_ok(job): - jobs_ok.append(job) - elif check.job_is_warning(job): - jobs_warning.append(job) - else: - jobs_critical.append(job) - logger.debug("Raw status OK : {!r}".format(jobs_ok)) - logger.debug("Raw status WARN : {!r}".format(jobs_warning)) - logger.debug("Raw status CRITICAL: {!r}".format(jobs_critical)) - if jobs_ok: - total_ok.append(jobs_ok[-1]) - elif jobs_warning: - total_warning.append(jobs_warning[-1]) - else: - total_critical.append(jobs_critical[-1]) + try: + status = CheckStatus(args, logger) + except CheckLoadError as exc: + print("UNKNOWN: Failed loading check from file '{!s}' ({!s})".format(exc.filename, exc.reason)) + return exit_status['UNKNOWN'] if args.cmd: # Single job check requested, output detailed information - if total_ok: - print('OK: {!s}'.format(total_ok[-1])) + if status.checks_ok: + print('OK: {!s}'.format(status.checks_ok[-1])) return exit_status['OK'] - if total_warning: - print('WARNING: 
{!s}'.format(total_warning[-1])) + if status.checks_warning: + print('WARNING: {!s}'.format(status.checks_warning[-1])) return exit_status['WARNING'] - if total_critical: - print('CRITICAL: {!s}'.format(total_critical[-1])) + if status.checks_critical: + print('CRITICAL: {!s}'.format(status.checks_critical[-1])) return exit_status['CRITICAL'] print "UNKNOWN - no jobs found for {!r}?".format(args.cmd) return exit_status['UNKNOWN'] - # When not looking at multiple jobs at once, logic gets a bit reversed - if ANY + # When looking at multiple jobs at once, logic gets a bit reversed - if ANY # job invocation is CRITICAL/WARNING, the aggregate message given to # Nagios will have to be a failure. - if total_critical: - print("CRITICAL: {num} job(s) in this state: {names}".format( - num = len(total_critical), - names = ', '.join([str(x.name) for x in total_critical]), - )) + if status.checks_critical: + print('CRITICAL: {!s}'.format( + _status_summary(status.num_jobs(), status.checks_critical))) return exit_status['CRITICAL'] - if total_warning: - print("WARNING: {num} job(s) in this state: {names}".format( - num = len(total_warning), - names = ', '.join([str(x.name) for x in total_warning]), - )) + if status.checks_warning: + print('WARNING: {!s}'.format( + _status_summary(status.num_jobs(), status.checks_warning))) return exit_status['WARNING'] - if total_ok: - print("OK: {num} job(s) in this state: {names}".format( - num = len(total_ok), - names = ', '.join([x.name for x in total_ok]), - )) + if status.checks_ok: + print('OK: {!s}'.format( + _status_summary(status.num_jobs(), status.checks_ok))) return exit_status['OK'] print "UNKNOWN - no jobs found?" return exit_status['UNKNOWN'] +def _status_summary(num_jobs, failed): + """ + String format routine used in output of checks status. 
+ """ + fmt = '1 job in this state: {summary}' + if len(failed) == 1: + fmt = '{jobs}/{num_jobs} job in this state: {summary}' + + summary = ', '.join(sorted([str(x.status_summary()) for x in failed])) + return fmt.format(jobs = len(failed), + num_jobs = num_jobs, + summary = summary, + ) + + def _get_job_results(args, logger): """ Load all jobs matching any specified name on the command line. @@ -634,7 +768,10 @@ def _get_job_results(args, logger): if not this.endswith('.json'): continue filename = os.path.join(args.datadir, this) - job = job_from_file(filename) + try: + job = job_from_file(filename) + except JobLoadError as exc: + logger.warning("Failed loading job file '{!s}' ({!s})".format(exc.filename, exc.reason)) if args.cmd: if args.cmd[0] != job.name: logger.debug("Skipping '{!s}' not matching '{!s}' (file {!s})".format(job.name, args.cmd[0], filename)) @@ -666,6 +803,27 @@ def _parse_time_value(value): return num +def _time_to_str(value): + """ + Format number of seconds to short readable string. + + @type value: float or int + + @rtype: string + """ + if value < 1: + # milliseconds + return '{:0.3f}ms'.format(value * 1000) + if value < 60: + return '{!s}s'.format(int(value)) + if value < 3600: + return '{!s}m'.format(int(value)) + if value < 86400: + return '{!s}h'.format(int(value / 3600)) + days = int(value / 86400) + return '{!s}d{!s}h'.format(days, int((value % 86400) / 3600)) + + def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults): """ Main entry point for either wrapping a script, or checking the status of it. 
@@ -699,6 +857,10 @@ def main(myname = 'scriptherder', args = None, logger = None, defaults=_defaults syslog_h.setFormatter(formatter) logger.addHandler(syslog_h) + if args.name and args.mode != 'wrap': + logger.error('Argument --name only applicable for --mode wrap') + return False + if args.mode == 'wrap': return mode_wrap(args, logger) elif args.mode == 'ls': diff --git a/global/post-tasks.d/015cosmos-trust b/global/post-tasks.d/015cosmos-trust index 447d875..5c3359b 100755 --- a/global/post-tasks.d/015cosmos-trust +++ b/global/post-tasks.d/015cosmos-trust @@ -4,12 +4,15 @@ if [ -z "$COSMOS_KEYS" ]; then COSMOS_KEYS=/etc/cosmos/keys fi +# Install new keys discovered in the $COSMOS_KEYS directory for k in $COSMOS_KEYS/*.pub; do fp=`cosmos gpg --with-colons --with-fingerprint < $k| awk -F: '$1 == "pub" {print $5}'` - cosmos gpg --with-colons --fingerprint | grep -q ":$fp:" || cosmos gpg --import < $k + # The removal of any ^pub:e: entrys means to ignore expired keys - thereby importing them again. + cosmos gpg --with-colons --fingerprint | grep -v "^pub:e:" | grep -q ":$fp:" || cosmos gpg --import < $k done -for fp in `cosmos gpg --with-colons --fingerprint | awk -F: '$1 == "pub" {print $5}'`; do +# Delete keys no longer present in $COSMOS_KEYS directory +for fp in `cosmos gpg --with-colons --fingerprint | awk -F: '$1 == "pub" {print $5 }'`; do seen="no" for k in $COSMOS_KEYS/*.pub; do cosmos gpg --with-colons --with-fingerprint < $k | grep -q ":$fp:" && seen="yes" |