207 lines
6.5 KiB
Django/Jinja
207 lines
6.5 KiB
Django/Jinja
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
from datetime import datetime, timezone
|
|
import subprocess
|
|
from os import path, stat, listdir
|
|
import sys
|
|
import json
|
|
|
|
from loguru import logger
|
|
|
|
import sudoisinflux
|
|
|
|
# Command line interface. Defaults are shown in --help via
# ArgumentDefaultsHelpFormatter.
parser = argparse.ArgumentParser(
    description="rsnapshot+metrics",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# what to do: only report metrics, or run rsnapshot and then report metrics
parser.add_argument("action", choices=["metrics", "backup"])
parser.add_argument(
    "-q", "--quiet",
    action="store_true",
    help="dont print to stdout/stderr",
)
parser.add_argument(
    "--interval",
    default="alpha",
    help="rsnapshot interval",
)
parser.add_argument(
    "--debug",
    action="store_true",
    help="debug logs",
)
parser.add_argument(
    "--logfile",
    default="/var/log/sudoisrsnapshot.log",
)
# NOTE(review): this flag is parsed but not read anywhere in the visible
# part of the file -- confirm whether it is still needed.
parser.add_argument(
    "--force-yes-i-know-what-im-doing",
    action="store_true",
)
args = parser.parse_args()
|
|
|
|
# absolute paths of the external binaries
rsnapshot_bin = "/usr/bin/rsnapshot"
date_bin = "/usr/bin/date"

# name of the timestamp file looked up under <snapshot>/<host>/root/
host_marker_name = ".backup_age_marker"
# timestamp file written after rsnapshot exits with status 0
successful_backup_marker_path = "/root/.last_succesful_backup"
# JSON file providing the used_percent / avail_gb / used_gb metric fields
sdfbackup_json_path = "/root/.sdfbackup.json"

# most recent snapshot directory for the selected interval, e.g. "alpha.0"
# (the root is filled in by the template engine)
latest_path = path.join("{{ rsnapshot_root }}", args.interval + ".0")
|
|
|
|
# drop loguru's default sink; the sinks below are chosen from the CLI flags
logger.remove()

if not args.debug:
    # normal operation: always write the logfile, and mirror to stdout
    # unless --quiet was given
    if not args.quiet:
        logger.add(sys.stdout, level="INFO")
    logger.add(args.logfile, level="INFO")
else:
    # --debug: everything goes to stdout only, nothing to the logfile
    logger.add(sys.stdout, level="DEBUG")
    logger.success("--debug: verbose output, not writing to logfile")
|
|
|
|
def utc_now():
    """Return the current time as a timezone-aware datetime in UTC."""
    current = datetime.now(timezone.utc)
    return current
|
|
|
|
def utc_isoformat():
    """Return the current UTC time as an ISO 8601 formatted string."""
    timestamp = utc_now()
    return timestamp.isoformat()
|
|
|
|
def write_marker(marker_path):
    """Overwrite *marker_path* with the current UTC time in ISO format.

    The file ends up containing a single line: the timestamp followed by
    a newline.
    """
    with open(marker_path, 'w') as marker_file:
        marker_file.write(f"{utc_isoformat()}\n")
|
|
|
|
def get_host_marker_age(hostname):
    """Return the age of the backup marker inside *hostname*'s latest backup.

    The marker is looked up at <latest_path>/<hostname>/root/<host_marker_name>
    and its age is computed by get_marker_age().
    """
    return get_marker_age(
        path.join(latest_path, hostname, "root", host_marker_name)
    )
|
|
|
|
def get_marker_age(marker_path):
    """Return how long ago the timestamp stored in *marker_path* was taken.

    The file is expected to contain an ISO formatted timestamp. The age is
    derived from the file's contents, not from its mtime.

    Returns a datetime.timedelta (current UTC time minus the timestamp).

    Raises FileNotFoundError if the marker does not exist, and ValueError
    if its contents cannot be parsed as an ISO timestamp.
    """
    with open(marker_path, 'r') as f:
        marker_utc = datetime.fromisoformat(f.read().strip())

    if marker_utc.tzinfo is None:
        # A timestamp without an UTC offset cannot be subtracted from an
        # aware datetime (TypeError). Markers written by write_marker() are
        # always UTC, so a naive timestamp is assumed to be UTC as well.
        marker_utc = marker_utc.replace(tzinfo=timezone.utc)

    return datetime.now(tz=timezone.utc) - marker_utc
|
|
|
|
def rsnapshot():
    """Run rsnapshot for the interval selected on the command line.

    Returns the subprocess.CompletedProcess of the run, with stdout and
    stderr captured as text.
    """
    # check=False: a failing rsnapshot must not raise here, the caller
    # inspects the returncode itself
    return subprocess.run(
        [rsnapshot_bin, args.interval],
        check=False,
        text=True,
        capture_output=True,
    )
|
|
|
|
|
|
def read_all_host_markers():
    """Yield ``(host, marker_age)`` for each host in the latest snapshot.

    Every entry of ``latest_path`` whose name starts with a letter is
    treated as a backed up host. ``marker_age`` is the timedelta returned
    by get_host_marker_age(), or None if the host's marker file is missing
    or cannot be parsed (an error is logged in that case).

    This is a generator: it can only be iterated once per call, and the
    marker files are read again on every call.

    Made into a wrapper function because that made error handling cleaner.
    """
    backed_up_hosts = [a for a in listdir(latest_path) if a[0].isalpha()]
    for host in backed_up_hosts:
        # only the marker lookup is guarded; the yield stays outside of the
        # try block so nothing raised at the yield point is mistaken for a
        # broken marker
        try:
            marker_age = get_host_marker_age(host)
        except (FileNotFoundError, ValueError) as e:
            logger.error(f"backup age marker missing/broken for {host}: {e}")
            marker_age = None
        yield host, marker_age
|
|
|
|
def send_metrics():
    """Write the rsnapshot job and per-host backup age datapoints.

    Two kinds of datapoints are passed to sudoisinflux.write():

    * one ``rsnapshot_job`` datapoint with the time since the last
      successful run, the number of host backups with and without a marker
      file, and the disk usage numbers read from ``sdfbackup_json_path``.
      It is skipped (and an error is logged) if the marker for the last
      successful backup does not exist yet.
    * one ``rsnapshot_backup`` datapoint per host that has a readable
      marker file, carrying the age of that marker in seconds.

    Raises whatever opening/parsing ``sdfbackup_json_path`` raises, and
    KeyError if it lacks one of the expected keys.
    """
    # read_all_host_markers() is a generator, so it is materialized exactly
    # once here. Filtering it twice would leave the second result empty, and
    # calling it again would re-read (and re-log) every marker.
    host_markers = list(read_all_host_markers())

    # all backups/hosts with a backup marker file
    backups = [(host, age) for host, age in host_markers if age is not None]
    # there should be no backups/hosts without a marker file, this
    # should always be empty
    backups_without_marker = [host for host, age in host_markers if age is None]

    with open(sdfbackup_json_path, "r") as f:
        sdfbackup = json.load(f)

    try:
        last_successful_backup = get_marker_age(successful_backup_marker_path)

        last_successful = {
            'tags': {
                'host': "{{ inventory_hostname }}",
                'interval': args.interval
            },
            'measurement': 'rsnapshot_job',
            'time': utc_isoformat(),
            'fields': {
                'seconds_since_last_successful_run': int(last_successful_backup.total_seconds()),
                'backups_count': len(backups),
                'backups_without_marker_count': len(backups_without_marker),
                'sdf_used_percent': sdfbackup['used_percent'],
                'sdf_avail_gb': sdfbackup['avail_gb'],
                'sdf_used_gb': sdfbackup['used_gb']
            }
        }
        sudoisinflux.write(last_successful, precision='h')
        logger.debug(f"influxdb.write: {json.dumps(last_successful, indent=2)}")

    except FileNotFoundError as e:
        # if there is no marker for a successful backup, it probably
        # hasn't been taken yet, so don't send a datapoint about a
        # successful backup that hasn't happened
        logger.error(e)

    # send one datapoint for each host that we have a backup marker for.
    # hosts without a readable marker have no age to report; they were
    # already logged by read_all_host_markers() and are counted in
    # 'backups_without_marker_count' above.
    for host, marker_age in backups:
        age_seconds = int(marker_age.total_seconds())
        datapoint = {
            'tags': {
                'host': "{{ inventory_hostname }}",
                'interval': args.interval,
                'backup_name': host,
            },
            'measurement': 'rsnapshot_backup',
            'time': utc_isoformat(),
            'fields': {
                'backup_age_seconds': age_seconds,
            }
        }
        sudoisinflux.write(datapoint, precision='h')
        logger.debug(f"influxdb.write: {json.dumps(datapoint, indent=2)}")

        # log an error once the marker is at least one full day old
        if marker_age.days > 0:
            logger.error(f"backup for {host} is {marker_age.days}d old")
        else:
            logger.debug(f"{host}: {age_seconds//60}m")
|
|
|
|
|
|
# Unexpected exceptions are logged by loguru and then turned into exit
# status 1. A bare @logger.catch would swallow them and let the process
# exit with status 0, hiding a crashed run from cron/monitoring.
@logger.catch(onerror=lambda _: sys.exit(1))
def main():
    """Entry point: send metrics, or run rsnapshot and then send metrics.

    Exit status:
      * 2 if the latest snapshot directory does not exist
      * 0 after ``metrics`` completed
      * the exit status of rsnapshot after ``backup``
      * 1 if an unexpected exception was raised
    """
    # NOTE(review): this check also applies to `backup`, so the snapshot
    # directory has to exist before rsnapshot is started -- confirm that
    # this is intended for a first-ever run.
    if not path.exists(latest_path):
        logger.error(f"path '{latest_path}' doesnt exist")
        sys.exit(2)

    if args.action == "metrics":
        send_metrics()
        sys.exit(0)

    elif args.action == "backup":
        logger.debug("starting rsnapshot ...")

        ps = rsnapshot()

        # send the metrics asap, logs names of backups without marker files
        # (if any). This happens before the success marker is updated below,
        # so the job datapoint is based on the previously written marker.
        send_metrics()

        level = "DEBUG" if ps.returncode == 0 else "ERROR"
        logger.log(level, f"`rsnapshot` returned status: '{ps.returncode}'")

        # update the marker for a successful backup only if rsnapshot
        # exits without errors
        if ps.returncode == 0:
            write_marker(successful_backup_marker_path)

        # any output from rsnapshot is logged as a warning
        if ps.stdout:
            logger.warning(f"rsnapshot stdout: \n{ps.stdout}")
        if ps.stderr:
            logger.warning(f"rsnapshot stderr: \n{ps.stderr}")

        # exit with the same returncode as `rsnapshot`
        sys.exit(ps.returncode)

    else:
        # argparse restricts `action` to the choices above, this is only a
        # safeguard
        raise SystemExit(f"invalid action: '{args.action}'")


if __name__ == "__main__":
    main()
|