PY-L7-44 · Project: Server Maintenance Toolkit

The Brief

3 min

Build servertool.py, a git-style CLI for remote server admin over SSH/SFTP:

status <host> — health report (uptime, disk, memory, services).
backup <host> <remote-path> — pull a remote folder/DB down to local, verified.
restart <host> <service> — restart a service (destructive → needs confirmation).
deploy <host> — run an ordered runbook, stopping on first failure.

It must support --dry-run, confirm before anything destructive, log every action, and never hard-code credentials.

Safety First

5 min

⚠️ This tool can break production

Restarting a service or deleting files on a live server is high-stakes. Build safety in from line one: destructive commands never act silently — --dry-run shows what would happen without doing it, and actually doing it requires explicit confirmation or a --yes flag; log everything (who ran what, when, the result); and operate only on servers you're authorised to manage. The cost of an accidental restart is real downtime.

This is where Level 7's "execute actions with care" principle becomes concrete: reversible reads (status) are free; irreversible writes (restart, delete) demand confirmation.

Build It · The SSH Core & Status

14 min

Shared connection helpers

import logging
import os
import shlex
from contextlib import contextmanager

import paramiko

# Console logging for the whole tool: INFO and above, each record prefixed
# with a wall-clock time (hours:minutes:seconds) and the level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Shared logger used by every sub-command.
log = logging.getLogger("servertool")

@contextmanager
def connect(host: str):
    """Yield a connected ``paramiko.SSHClient`` for *host*, closing it on exit.

    The login name and private-key path are read from the ``SSH_USER`` and
    ``SSH_KEY`` environment variables, so no credentials live in the source.
    Only hosts present in the system known-hosts file are accepted; unknown
    host keys are rejected (``RejectPolicy``).

    Raises:
        KeyError: if ``SSH_USER`` or ``SSH_KEY`` is not set.
        Exception: whatever ``SSHClient.connect`` raises (connection, host-key
            or authentication failure) propagates to the caller.
    """
    client = paramiko.SSHClient()
    try:
        client.load_system_host_keys()
        client.set_missing_host_key_policy(paramiko.RejectPolicy())
        # Connecting happens inside the try so that a failed or half-open
        # connection attempt is still cleaned up by the finally clause.
        client.connect(
            host,
            username=os.environ["SSH_USER"],
            key_filename=os.environ["SSH_KEY"],
            timeout=15,
        )
        yield client
    finally:
        client.close()

def run(client, cmd: str) -> tuple[int, str, str]:
    """Execute *cmd* on the remote host and return ``(exit_code, stdout, stderr)``.

    Args:
        client: a connected SSH client exposing ``exec_command``.
        cmd: the shell command line to execute remotely.

    Returns:
        The command's exit status followed by its decoded standard output
        and standard error.
    """
    _, out, err = client.exec_command(cmd, timeout=60)
    # Drain the output streams *before* asking for the exit status: waiting
    # on the status first can block forever when the command produces more
    # output than the channel window holds, because nobody is reading it.
    stdout_text = out.read().decode(errors="replace")
    stderr_text = err.read().decode(errors="replace")
    # errors="replace" keeps a stray non-UTF-8 byte in command output from
    # aborting the whole operation with UnicodeDecodeError.
    return out.channel.recv_exit_status(), stdout_text, stderr_text

The status command (safe, read-only)

def cmd_status(args):
    """Print a read-only health report for ``args.host``.

    Runs a fixed set of inspection commands (uptime, root-disk usage, memory
    usage, load averages) and then ``systemctl is-active`` for every name in
    ``args.services`` (which may be ``None``).  Nothing on the server is
    modified, so no confirmation is required.

    Args:
        args: parsed CLI namespace with ``host`` and ``services`` attributes.
    """
    checks = {
        "uptime":  "uptime -p",
        "disk":    "df -h / | tail -1 | awk '{print $5\" used\"}'",
        "memory":  "free -m | awk '/Mem:/ {printf \"%d%% used\", $3/$2*100}'",
        "load":    "cat /proc/loadavg | awk '{print $1, $2, $3}'",
    }
    with connect(args.host) as ssh:
        print(f"=== {args.host} ===")
        for label, cmd in checks.items():
            code, out, _ = run(ssh, cmd)
            print(f"  {label:8} {out.strip() if code == 0 else 'ERROR'}")
        # service states
        for svc in args.services or []:
            # Quote the name so it is always passed to systemctl as a single
            # argument and can never be interpreted by the remote shell.
            # The exit code is deliberately not checked: is-active exits
            # non-zero for a stopped unit but still prints its state.
            _, out, _ = run(ssh, f"systemctl is-active {shlex.quote(svc)}")
            # No output at all means the query itself failed.
            print(f"  {svc:8} {out.strip() or 'ERROR'}")

status only reads — it's always safe to run, needs no confirmation, and reuses the health ideas from Lesson 43 but over SSH (Lesson 41).

Build It · Destructive Commands & Wiring

12 min

Restart — guarded

def confirm(action: str, dry_run: bool, assume_yes: bool) -> bool:
    """Decide whether a destructive *action* may go ahead.

    Args:
        action: human-readable description, e.g. ``"restart nginx on web-01"``.
        dry_run: when true, log what would happen and refuse to proceed.
        assume_yes: when true (and not a dry run), proceed without prompting.

    Returns:
        ``True`` only if the action should really be performed.  ``dry_run``
        takes precedence over ``assume_yes``.
    """
    if dry_run:
        log.info("[dry-run] would: %s", action)
        return False
    if assume_yes:
        return True
    try:
        answer = input(f"About to {action}. Type 'yes' to proceed: ")
    except EOFError:
        # No stdin to read from (cron job, closed pipe, ...): nobody can
        # approve, so the safe answer is "no" rather than a crash.
        return False
    return answer.strip().lower() == "yes"

def cmd_restart(args):
    """Restart ``args.service`` on ``args.host`` behind the confirmation gate.

    Nothing is sent to the server unless ``confirm`` approves (it declines on
    ``--dry-run``, proceeds on ``--yes``, and otherwise asks the operator).
    After a successful restart the unit state is queried and anything other
    than ``active`` is reported as an error.

    Args:
        args: parsed CLI namespace with ``host``, ``service``, ``dry_run``
            and ``yes`` attributes.
    """
    action = f"restart {args.service} on {args.host}"
    if not confirm(action, args.dry_run, args.yes):
        log.info("aborted: %s", action)
        return
    # Quote once so the name reaches systemctl as a single argument and is
    # never interpreted by the remote shell (this command runs under sudo).
    unit = shlex.quote(args.service)
    with connect(args.host) as ssh:
        code, _, err = run(ssh, f"sudo systemctl restart {unit}")
        if code != 0:
            log.error("restart failed: %s", err.strip())
            return
        log.info("restarted %s ✅", args.service)
        # verify it came back up
        _, state, _ = run(ssh, f"systemctl is-active {unit}")
        state = state.strip()
        if state == "active":
            log.info("%s is now: %s", args.service, state)
        else:
            # The restart command succeeded but the unit is not running:
            # make that loud instead of reporting it like a success.
            log.error("%s is now: %s", args.service, state or "unknown")

The confirm gate is the heart of safety: dry-run shows intent without acting, --yes allows automation, and otherwise it asks a human. After restarting, it verifies the service actually came back — a restart that kills a service silently is worse than no restart.

Backup — pull & verify (SFTP)

from pathlib import Path

def cmd_backup(args):
    """Download ``args.remote_path`` from ``args.host`` and verify its size.

    The file is saved as ``server-backups/<host>/<basename>``.  With
    ``--dry-run`` the planned transfer is logged and nothing else happens:
    no directory is created and no connection is opened.

    NOTE(review): the transfer uses a single ``sftp.get`` call, which looks
    like it handles one regular file only — confirm before pointing this at
    a directory.

    Args:
        args: parsed CLI namespace with ``host``, ``remote_path`` and
            ``dry_run`` attributes.
    """
    remote = args.remote_path
    name = remote.rstrip("/").split("/")[-1]
    if name in ("", ".", ".."):
        # e.g. "/" or "data/..": there is no usable file name to save under.
        log.error("cannot derive a local file name from %r", remote)
        return
    local_dir = Path("server-backups") / args.host
    target = local_dir / name
    if args.dry_run:
        log.info("[dry-run] would download %s → %s", remote, target)
        return
    local_dir.mkdir(parents=True, exist_ok=True)
    with connect(args.host) as ssh:
        sftp = ssh.open_sftp()
        try:
            sftp.get(remote, str(target))
            remote_size = sftp.stat(remote).st_size
        finally:
            # Always release the SFTP session, including on errors.
            sftp.close()
    local_size = target.stat().st_size
    if local_size != remote_size:
        log.error("size mismatch — backup incomplete")
        return
    log.info("backed up %s → %s (%.1f KB)", remote, target, local_size / 1000)

Wire it up

import argparse

def main(argv=None):
    """Parse the command line and dispatch to the chosen sub-command.

    The global flags ``--dry-run`` and ``--yes`` are defined on the top-level
    parser, so they go before the sub-command name
    (``servertool.py --dry-run restart web-01 nginx``).

    Args:
        argv: argument list to parse; ``None`` (the default) means use
            ``sys.argv[1:]``.  Passing a list allows calling ``main`` from
            other code.
    """
    parser = argparse.ArgumentParser(description="Remote server maintenance.")
    parser.add_argument("--dry-run", action="store_true",
                        help="show actions without performing them")
    parser.add_argument("--yes", action="store_true",
                        help="skip confirmation prompts (for automation)")
    sub = parser.add_subparsers(dest="cmd", required=True)

    status = sub.add_parser("status")
    status.add_argument("host")
    status.add_argument("--services", nargs="*")
    status.set_defaults(func=cmd_status)

    backup = sub.add_parser("backup")
    backup.add_argument("host")
    backup.add_argument("remote_path")
    backup.set_defaults(func=cmd_backup)

    restart = sub.add_parser("restart")
    restart.add_argument("host")
    restart.add_argument("service")
    restart.set_defaults(func=cmd_restart)

    args = parser.parse_args(argv)
    # Each sub-parser stored its handler in ``func``.
    args.func(args)


if __name__ == "__main__":
    main()

$ python servertool.py status web-01 --services nginx myapp
=== web-01 ===
  uptime   up 12 days, 4 hours
  disk     63% used
  memory   41% used
  load     0.12 0.18 0.21
  nginx    active
  myapp    active

$ python servertool.py --dry-run restart web-01 nginx
[dry-run] would: restart nginx on web-01

$ python servertool.py restart web-01 nginx
About to restart nginx on web-01. Type 'yes' to proceed: yes
restarted nginx ✅
nginx is now: active

Read the result

The toolkit is the SSH/SFTP foundation (Lessons 41-42) wrapped in the subcommand CLI pattern (Lesson 4), with a single uncompromising safety layer: status and backup only read from the server, so they are free to run, but restart passes through confirm — --dry-run shows the intent without acting, and only an explicit human yes or --yes lets it proceed — and then verifies the outcome. Every action is logged. This is how real ops tooling is built: powerful, but impossible to fire by accident.

Build It Yourself

13 min

Target a VM, Pi, or localhost SSH. Use a harmless "service" for restart testing (e.g. a dummy script you can start/stop) so you never disrupt anything real.

01 🟢 Status first

Get status working: connect, run the read-only checks, and print a clean report. Confirm it never modifies anything.

02 🟡 Guarded restart

Implement restart with the confirm gate. Test all three paths: --dry-run (shows, doesn't act), no flag (prompts), --yes (acts immediately). Verify the service state after.

03 🔴 Add deploy

Add a deploy subcommand that runs an ordered runbook (Lesson 41's run_steps): pull code, install deps, restart, health-check — stopping and reporting on the first failure. Make it respect --dry-run.

Hint

# Ordered deploy runbook; each entry is one remote shell command.
STEPS = [
    "cd /app && git pull",
    "cd /app && pip install -r requirements.txt",
    "sudo systemctl restart myapp",
    "curl -fsS http://localhost/health",
]


def cmd_deploy(args):
    """Run the ``STEPS`` runbook on ``args.host``, stopping at the first failure.

    With ``--dry-run`` the steps are only listed, without opening a
    connection.  Otherwise the run must pass the ``confirm`` gate first,
    because the runbook restarts a service.

    Args:
        args: parsed CLI namespace with ``host``, ``dry_run`` and ``yes``
            attributes.
    """
    if args.dry_run:
        for i, step in enumerate(STEPS, 1):
            log.info("[dry-run] step %d: %s", i, step)
        return
    action = f"run the {len(STEPS)}-step deploy runbook on {args.host}"
    if not confirm(action, args.dry_run, args.yes):
        log.info("aborted: %s", action)
        return
    with connect(args.host) as ssh:
        for i, step in enumerate(STEPS, 1):
            code, out, err = run(ssh, step)
            if code != 0:
                # Some tools report errors on stdout; fall back to it when
                # stderr is empty so the failure reason is not lost.
                log.error("step %d FAILED: %s\n%s", i, step,
                          (err.strip() or out.strip()))
                return
            log.info("step %d ok", i)

Stretch · Fleet Mode & Notifications

8 min

Make commands work across many servers from a hosts file, running in sequence with a per-host summary, and post the results to Slack (Lesson 31). For deploy, add a "stop the rollout if a host fails" option. This turns a single-box tool into fleet management.

Show the key additions

def cmd_status_all(args):
    """Collect root-disk usage from every host in ``args.hosts_file``.

    The hosts file holds whitespace-separated host names; anything after a
    ``#`` on a line is ignored.  Hosts are visited in order, and one failing
    host does not stop the others.  The per-host summary is printed and then
    sent through ``notify``.

    Args:
        args: parsed CLI namespace with a ``hosts_file`` attribute.
    """
    hosts = []
    for line in Path(args.hosts_file).read_text().splitlines():
        # Drop trailing comments, then accept one or more hosts per line.
        hosts.extend(line.split("#", 1)[0].split())

    results = {}
    for host in hosts:
        try:
            with connect(host) as ssh:
                code, disk, _ = run(ssh, "df -h / | tail -1 | awk '{print $5}'")
        except Exception as e:
            # Per-host boundary: record the failure and carry on with the
            # rest of the fleet instead of aborting the whole report.
            log.warning("%s unreachable: %s", host, e)
            results[host] = f"UNREACHABLE: {e}"
            continue
        usage = disk.strip()
        if code == 0 and usage:
            results[host] = f"disk {usage}"
        else:
            # Don't present a failed check as an (empty) success.
            results[host] = f"ERROR: disk check failed (exit {code})"

    summary = "\n".join(f"{h}: {r}" for h, r in results.items())
    print(summary)
    from notify import notify          # Lesson 31
    notify("Fleet status:\n" + summary)

Non-negotiables: iterate hosts from a file, per-host summary, Slack post, optional stop-on-failure for deploy.

Recap

3 min

The maintenance toolkit composes SSH command execution (41), SFTP transfer (42), and health checks (43) into a subcommand CLI (4) — but its defining feature is safety. Read-only commands (status) run freely; destructive ones (restart, deploy, deletes) pass through a confirm gate that supports --dry-run (show, don't act), human confirmation, and --yes for automation — then verify the result and log everything. That discipline — match the caution to the blast radius — is exactly what separates a dangerous script from a trustworthy ops tool, and it carries straight into the capstone.

Vocabulary Card

dry run: Showing what an action would do without performing it.
confirmation gate: Requiring explicit human (or --yes) approval before a destructive action.
runbook: An ordered list of steps to perform an operation (e.g. deploy).
blast radius: How much can break if an action goes wrong — guides how much caution to apply.

Homework

4 min

Finish servertool.py with at least status, backup, and restart — full dry-run + confirmation safety, result verification, and logging — tested against a VM/Pi/localhost. Add one stretch feature (deploy runbook, fleet mode, or Slack reporting). Write a short ops note: which commands are safe vs. guarded, and the exact safeguards on the destructive ones.

Sample · ops note

servertool.py — command safety classification

SAFE (read-only, no confirmation):
  status <host>            reads uptime/disk/mem/services
  backup <host> <path>     downloads (never modifies the server)

GUARDED (destructive, confirmation required):
  restart <host> <svc>     → confirm() gate:
      --dry-run  → prints "would restart", does nothing
      (default)  → prompts "type yes to proceed"
      --yes      → proceeds (for cron/automation)
      after acting, verifies 'systemctl is-active' = active
  deploy <host>            → runbook, stops on first failed step

Every invocation logs host + action + result to servertool.log.
Credentials (SSH_USER/SSH_KEY) come from .env; nothing hard-coded.

Non-negotiables: ≥3 commands, dry-run+confirm on destructive ones, result verification, logging, one stretch feature, the ops note.

import os, logging from contextlib import contextmanager import paramiko logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S") log = logging.getLogger("servertool") @contextmanager def connect(host: str): client = paramiko.SSHClient() client.load_system_host_keys() client.set_missing_host_key_policy(paramiko.RejectPolicy()) client.connect(host, username=os.environ["SSH_USER"], key_filename=os.environ["SSH_KEY"], timeout=15) try: yield client finally: client.close() def run(client, cmd: str) -> tuple[int, str, str]: _, out, err = client.exec_command(cmd, timeout=60) return out.channel.recv_exit_status(), out.read().decode(), err.read().decode()

def cmd_status(args): with connect(args.host) as ssh: checks = { "uptime": "uptime -p", "disk": "df -h / | tail -1 | awk '{print $5\" used\"}'", "memory": "free -m | awk '/Mem:/ {printf \"%d%% used\", $3/$2*100}'", "load": "cat /proc/loadavg | awk '{print $1, $2, $3}'", } print(f"=== {args.host} ===") for label, cmd in checks.items(): code, out, _ = run(ssh, cmd) print(f" {label:8} {out.strip() if code == 0 else 'ERROR'}") # service states for svc in args.services or []: code, out, _ = run(ssh, f"systemctl is-active {svc}") print(f" {svc:8} {out.strip()}")

def confirm(action: str, dry_run: bool, assume_yes: bool) -> bool: if dry_run: log.info("[dry-run] would: %s", action) return False if assume_yes: return True answer = input(f"About to {action}. Type 'yes' to proceed: ") return answer.strip().lower() == "yes" def cmd_restart(args): action = f"restart {args.service} on {args.host}" if not confirm(action, args.dry_run, args.yes): log.info("aborted: %s", action) return with connect(args.host) as ssh: code, out, err = run(ssh, f"sudo systemctl restart {args.service}") if code == 0: log.info("restarted %s ✅", args.service) # verify it came back up _, state, _ = run(ssh, f"systemctl is-active {args.service}") log.info("%s is now: %s", args.service, state.strip()) else: log.error("restart failed: %s", err.strip())

from pathlib import Path def cmd_backup(args): local_dir = Path("server-backups") / args.host local_dir.mkdir(parents=True, exist_ok=True) with connect(args.host) as ssh: sftp = ssh.open_sftp() remote = args.remote_path name = remote.rstrip("/").split("/")[-1] target = local_dir / name if args.dry_run: log.info("[dry-run] would download %s → %s", remote, target) return sftp.get(remote, str(target)) remote_size = sftp.stat(remote).st_size if target.stat().st_size != remote_size: log.error("size mismatch — backup incomplete"); return log.info("backed up %s → %s (%.1f KB)", remote, target, target.stat().st_size / 1000) sftp.close()

import argparse def main(): p = argparse.ArgumentParser(description="Remote server maintenance.") p.add_argument("--dry-run", action="store_true", help="show actions without performing them") p.add_argument("--yes", action="store_true", help="skip confirmation prompts (for automation)") sub = p.add_subparsers(dest="cmd", required=True) s = sub.add_parser("status"); s.add_argument("host") s.add_argument("--services", nargs="*"); s.set_defaults(func=cmd_status) b = sub.add_parser("backup"); b.add_argument("host") b.add_argument("remote_path"); b.set_defaults(func=cmd_backup) r = sub.add_parser("restart"); r.add_argument("host") r.add_argument("service"); r.set_defaults(func=cmd_restart) args = p.parse_args() args.func(args) if __name__ == "__main__": main()

$ python servertool.py status web-01 --services nginx myapp === web-01 === uptime up 12 days, 4 hours disk 63% used memory 41% used load 0.12 0.18 0.21 nginx active myapp active $ python servertool.py --dry-run restart web-01 nginx [dry-run] would: restart nginx on web-01 $ python servertool.py restart web-01 nginx About to restart nginx on web-01. Type 'yes' to proceed: yes restarted nginx ✅ nginx is now: active

STEPS = [ "cd /app && git pull", "cd /app && pip install -r requirements.txt", "sudo systemctl restart myapp", "curl -fsS http://localhost/health", ] def cmd_deploy(args): with connect(args.host) as ssh: for i, step in enumerate(STEPS, 1): if args.dry_run: log.info("[dry-run] step %d: %s", i, step); continue code, out, err = run(ssh, step) if code != 0: log.error("step %d FAILED: %s\n%s", i, step, err.strip()) return log.info("step %d ok", i)

def cmd_status_all(args): hosts = Path(args.hosts_file).read_text().split() results = {} for host in hosts: try: with connect(host) as ssh: _, disk, _ = run(ssh, "df -h / | tail -1 | awk '{print $5}'") results[host] = f"disk {disk.strip()}" except Exception as e: results[host] = f"UNREACHABLE: {e}" summary = "\n".join(f"{h}: {r}" for h, r in results.items()) print(summary) from notify import notify # Lesson 31 notify("Fleet status:\n" + summary)