infra/roles/backup/templates/sudoisrsnapshot.py.j2

207 lines
6.5 KiB
Django/Jinja

#!/usr/bin/env python3
import argparse
from datetime import datetime, timezone
import subprocess
from os import path, stat, listdir
import sys
import json
from loguru import logger
import sudoisinflux
# Command-line interface. Arguments are parsed at import time, so `args` is a
# module-level value that every function below reads directly.
parser = argparse.ArgumentParser(
    description="rsnapshot+metrics",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument("action", choices=["metrics", "backup"])
parser.add_argument(
    "-q",
    "--quiet",
    action="store_true",
    help="dont print to stdout/stderr",
)
parser.add_argument("--interval", default="alpha", help="rsnapshot interval")
parser.add_argument("--debug", action="store_true", help="debug logs")
parser.add_argument("--logfile", default="/var/log/sudoisrsnapshot.log")
# NOTE(review): this flag is accepted but nothing in the visible code reads it.
parser.add_argument("--force-yes-i-know-what-im-doing", action="store_true")
args = parser.parse_args()
# Executable started by rsnapshot().
rsnapshot_bin = "/usr/bin/rsnapshot"
# NOTE(review): not referenced anywhere in the visible code.
date_bin = "/usr/bin/date"
# File name of the per-host age marker, looked up under <host>/root/ inside
# the latest snapshot (see get_host_marker_age).
host_marker_name = ".backup_age_marker"
# Timestamp file rewritten by main() each time rsnapshot exits with status 0.
successful_backup_marker_path = "/root/.last_succesful_backup"
# JSON file read by send_metrics(); it must provide the keys 'used_percent',
# 'avail_gb' and 'used_gb'.
sdfbackup_json_path = "/root/.sdfbackup.json"
# Most recent snapshot directory for the selected interval: "<interval>.0"
# below the templated rsnapshot root.
latest_path = path.join("{{ rsnapshot_root }}", f"{args.interval}.0")
# Start from a clean slate, then attach the sinks selected by the CLI flags.
logger.remove()
if not args.debug:
    # Normal operation: INFO to stdout unless --quiet was given, and INFO to
    # the logfile in either case.
    if not args.quiet:
        logger.add(sys.stdout, level="INFO")
    logger.add(args.logfile, level="INFO")
else:
    # --debug: everything goes to stdout and the logfile is left alone.
    logger.add(sys.stdout, level="DEBUG")
    logger.success("--debug: verbose output, not writing to logfile")
def utc_now():
    """Return the current time as a timezone-aware datetime in UTC."""
    return datetime.now(timezone.utc)
def utc_isoformat():
    """Return the current UTC time formatted as an ISO 8601 string."""
    now = utc_now()
    return now.isoformat()
def write_marker(marker_path):
    """Overwrite *marker_path* with the current UTC timestamp.

    The file ends up containing a single ISO 8601 timestamp followed by a
    newline, which is the format get_marker_age() reads back.
    """
    with open(marker_path, mode="w") as marker_file:
        marker_file.write(f"{utc_isoformat()}\n")
def get_host_marker_age(hostname):
    """Return the age of the marker stored in *hostname*'s latest backup.

    The marker is looked up at <latest_path>/<hostname>/root/<host_marker_name>
    and handed to get_marker_age(), whose exceptions propagate unchanged.
    """
    return get_marker_age(
        path.join(latest_path, hostname, "root", host_marker_name)
    )
def get_marker_age(marker_path):
    """Return how long ago the timestamp stored in *marker_path* was taken.

    The file must contain an ISO 8601 timestamp; surrounding whitespace is
    ignored. The result is a timedelta: the current UTC time minus the
    stored timestamp.

    Raises whatever open() raises for an unreadable path, and ValueError when
    the contents are not a valid ISO 8601 timestamp.
    """
    # The timestamp is taken from the file's contents rather than from its
    # mtime (using mtime would probably have been simpler).
    with open(marker_path, "r") as marker_file:
        raw_stamp = marker_file.read()
    written_at = datetime.fromisoformat(raw_stamp.strip())
    return utc_now() - written_at
def rsnapshot():
    """Run rsnapshot for the interval selected on the command line.

    Returns the subprocess.CompletedProcess of the run, with stdout and
    stderr captured as text. A non-zero exit status does not raise; callers
    inspect `returncode` themselves.
    """
    return subprocess.run(
        [rsnapshot_bin, args.interval],
        check=False,  # a failing rsnapshot must not raise here
        text=True,
        capture_output=True,
    )
def read_all_host_markers():
    """Yield ``(host, marker_age)`` for each host directory in the latest backup.

    Every entry of ``latest_path`` whose name starts with a letter is treated
    as a backed-up host. ``marker_age`` is the timedelta returned by
    get_host_marker_age(), or None when the host's marker file is missing or
    does not contain a parseable timestamp (that case is logged as an error).

    This is a generator: the result can only be iterated once. Wrap it in
    list() if it has to be traversed more than once.
    """
    for host in listdir(latest_path):
        if not host[0].isalpha():
            continue
        # Only the marker lookup is guarded; the yield stays outside the try
        # so that nothing raised by the consumer is mistaken for a bad marker.
        try:
            marker_age = get_host_marker_age(host)
        except (FileNotFoundError, ValueError) as e:
            logger.error(f"backup age marker missing/broken for {host}: {e}")
            marker_age = None
        yield host, marker_age
def send_metrics():
    """Write the rsnapshot job datapoint and one datapoint per backed-up host.

    Two kinds of datapoints are passed to sudoisinflux.write():

    * one 'rsnapshot_job' point carrying the time since the last successful
      run, the number of host backups with and without an age marker, and the
      disk usage figures read from ``sdfbackup_json_path``. It is skipped
      (and an error is logged) when the success marker is missing or
      unreadable.
    * one 'rsnapshot_backup' point for each host that has an age marker,
      carrying the age of that host's backup in seconds.

    Raises FileNotFoundError if ``sdfbackup_json_path`` does not exist, and
    KeyError if that file lacks one of the expected keys.
    """
    # read_all_host_markers() is a generator, so it is materialized once:
    # it has to be filtered twice below, and reading the markers a single
    # time also keeps "marker missing" errors from being logged twice.
    all_backup_subdirs = list(read_all_host_markers())

    # all backups/hosts with a backup marker file
    backups = [a for a in all_backup_subdirs if a[1] is not None]

    # there should be no backups/hosts without a marker file, this
    # should always be 0
    backups_without_marker = [a for a in all_backup_subdirs if a[1] is None]

    with open(sdfbackup_json_path, "r") as f:
        sdfbackup = json.load(f)

    try:
        last_successful_backup = get_marker_age(successful_backup_marker_path)
    except (FileNotFoundError, ValueError) as e:
        # If there is no (readable) marker for a successful backup, one has
        # probably not been taken yet, so do not send a datapoint about a
        # successful backup that has not happened. A corrupt marker is
        # handled the same way so that the caller can carry on and rewrite
        # it after the next successful run.
        logger.error(e)
    else:
        last_successful = {
            'tags': {
                'host': "{{ inventory_hostname }}",
                'interval': args.interval
            },
            'measurement': 'rsnapshot_job',
            'time': utc_isoformat(),
            'fields': {
                'seconds_since_last_successful_run': int(last_successful_backup.total_seconds()),
                'backups_count': len(backups),
                'backups_without_marker_count': len(backups_without_marker),
                'sdf_used_percent': sdfbackup['used_percent'],
                'sdf_avail_gb': sdfbackup['avail_gb'],
                'sdf_used_gb': sdfbackup['used_gb']
            }
        }
        sudoisinflux.write(last_successful, precision='h')
        logger.debug(f"influxdb.write: {json.dumps(last_successful, indent=2)}")

    # Send one datapoint for each host whose backup has an age marker. Hosts
    # without one have no age to report; they were already logged as errors
    # and are counted in 'backups_without_marker_count' above.
    for host, marker_age in backups:
        age_seconds = int(marker_age.total_seconds())
        datapoint = {
            'tags': {
                'host': "{{ inventory_hostname }}",
                'interval': args.interval,
                'backup_name': host,
            },
            'measurement': 'rsnapshot_backup',
            'time': utc_isoformat(),
            'fields': {
                'backup_age_seconds': age_seconds,
            }
        }
        sudoisinflux.write(datapoint, precision='h')
        logger.debug(f"influxdb.write: {json.dumps(datapoint, indent=2)}")

        # a backup that is at least one full day old is logged as an error
        if marker_age.days > 0:
            logger.error(f"backup for {host} is {marker_age.days}d old")
        else:
            logger.debug(f"{host}: {age_seconds//60}m")
@logger.catch
def main():
    """Entry point: report metrics, or run a backup and then report metrics.

    Exit status:
      * 2 when the latest snapshot directory does not exist,
      * 0 after the "metrics" action,
      * rsnapshot's own return code after the "backup" action.
    """
    if not path.exists(latest_path):
        logger.error(f"path '{latest_path}' doesnt exist")
        sys.exit(2)

    if args.action == "metrics":
        send_metrics()
        sys.exit(0)

    if args.action != "backup":
        raise SystemExit(f"invalid action: '{args.action}'")

    logger.debug("starting rsnapshot ...")
    ps = rsnapshot()

    # Metrics go out right away, before the success marker is refreshed
    # below. This also logs the names of backups without marker files.
    send_metrics()

    succeeded = ps.returncode == 0
    logger.log(
        "DEBUG" if succeeded else "ERROR",
        f"`rsnapshot` returned status: '{ps.returncode}'",
    )

    # The success marker is only refreshed when rsnapshot exited cleanly.
    if succeeded:
        write_marker(successful_backup_marker_path)

    if ps.stdout:
        logger.warning(f"rsnapshot stdout: \n{ps.stdout}")
    if ps.stderr:
        logger.warning(f"rsnapshot stderr: \n{ps.stderr}")

    # Propagate rsnapshot's own exit status.
    sys.exit(ps.returncode)


if __name__ == "__main__":
    main()