# watch.sh · docker-watcher · main

#!/bin/bash

# --- Configuration ---------------------------------------------------------
# URL that alerts are sent to. Taken from the environment; when NTFY_URL is
# unset or empty it falls back to the placeholder value "invalid".
: "${NTFY_URL:=invalid}"

# Seconds to wait after a "start" event before declaring a recovery.
RECOVERY_WAIT=15

# --- State (associative arrays keyed by container name) --------------------
#   last_notified  epoch seconds of the most recent alert (used for debounce)
#   unhealthy      set for containers we have alerted on
declare -A last_notified unhealthy

#######################################
# Send one notification to the ntfy endpoint via HTTP POST.
# Globals:
#   NTFY_URL      (read) destination URL
#   NTFY_TIMEOUT  (read, optional) max seconds for the whole request;
#                 defaults to 10
# Arguments:
#   $1 - title
#   $2 - message body
#   $3 - priority (optional, default "default")
#   $4 - comma-separated tags (optional, default "docker")
# Outputs:
#   Nothing on stdout (the server response is discarded); a one-line
#   warning on stderr when delivery fails.
# Returns:
#   0 on success, otherwise curl's exit status.
#######################################
notify() {
  local title="$1"
  local msg="$2"
  local priority="${3:-default}"
  local tags="${4:-docker}"
  local rc=0

  # --fail       : treat HTTP 4xx/5xx as errors instead of silent success
  # --show-error : keep curl's own error text even though --silent is on
  # --max-time   : never let a stalled server block the caller forever
  # --output     : drop the response body so it does not pollute our stdout
  curl --silent --show-error --fail \
    --max-time "${NTFY_TIMEOUT:-10}" \
    --output /dev/null \
    -H "Title: $title" \
    -H "Priority: $priority" \
    -H "Tags: $tags" \
    -d "$msg" \
    "$NTFY_URL" || rc=$?

  if (( rc != 0 )); then
    # Best-effort: report and hand the status back; the caller decides.
    printf 'notify: failed to deliver "%s" (curl exit %d)\n' "$title" "$rc" >&2
  fi
  return "$rc"
}

#######################################
# Report whether a container is currently running.
# Arguments:
#   $1 - container name passed to `docker inspect`
# Returns:
#   0 when the inspect output contains "true"; 1 otherwise (output is
#   something else, or empty because inspect failed).
#######################################
is_running() {
  local container="$1"
  local running_flag

  # stderr is discarded, so a failed inspect simply yields empty output.
  running_flag=$(docker inspect --format '{{.State.Running}}' "$container" 2>/dev/null)

  case "$running_flag" in
    *true*) return 0 ;;
    *)      return 1 ;;
  esac
}

# ---------------------------------------------------------------------------
# Main loop: stream die/oom/start events from `docker events` and turn them
# into notifications.
#
#   die    with non-zero exit code -> "Container Died" alert, container flagged
#   oom                            -> "OOM Kill" alert, container flagged
#   start  of a flagged container  -> after RECOVERY_WAIT seconds, if it is
#                                     still running, "Container Recovered"
#                                     and the flag is cleared
#
# Each event is formatted as "action|name|exitCode" and split on '|'.
# The loop is the last stage of a pipeline, so it runs in a subshell; the
# state arrays are read and updated there for as long as the stream lasts.
# ---------------------------------------------------------------------------

# Minimum number of seconds between two die/oom alerts for one container.
ALERT_DEBOUNCE=60

docker events \
  --filter 'event=die' \
  --filter 'event=oom' \
  --filter 'event=start' \
  --format '{{.Action}}|{{.Actor.Attributes.name}}|{{.Actor.Attributes.exitCode}}' \
| while IFS='|' read -r event_status container_name exit_code; do

  # An empty name cannot be used as an associative-array key; skip the line
  # instead of letting every lookup below fail with a subscript error.
  [[ -n "$container_name" ]] || continue

  now=$(date +%s)

  case "$event_status" in

    die)
      # Skip clean/intentional stops
      [[ "$exit_code" == "0" ]] && continue

      # Debounce: skip if we already alerted within the window
      last=${last_notified[$container_name]:-0}
      (( now - last < ALERT_DEBOUNCE )) && continue

      last_notified[$container_name]=$now
      unhealthy[$container_name]=1

      notify "💀 Container Died" \
             "$container_name exited unexpectedly (exit code $exit_code)" \
             "high" "docker,skull"
      ;;

    oom)
      # Same debounce as "die", sharing the same per-container timestamp
      last=${last_notified[$container_name]:-0}
      (( now - last < ALERT_DEBOUNCE )) && continue

      last_notified[$container_name]=$now
      unhealthy[$container_name]=1

      notify "🧠 OOM Kill" \
             "$container_name was killed — hit its memory limit!" \
             "urgent" "docker,warning"
      ;;

    start)
      # Only care about containers we previously flagged as unhealthy
      [[ -n "${unhealthy[$container_name]:-}" ]] || continue

      # Wait a bit, then confirm it actually stayed up. This blocks the loop:
      # later events are read only once the wait is over.
      sleep "$RECOVERY_WAIT"

      if is_running "$container_name"; then
        # Quote the argument so the shell cannot glob-expand "name[...]"
        # against files in the current directory before unset sees it.
        unset 'unhealthy[$container_name]'
        unset 'last_notified[$container_name]'

        notify "✅ Container Recovered" \
               "$container_name is back up and running" \
               "default" "docker,white_check_mark"
      fi
      # If it's not running after the wait, it crashed again —
      # the die handler will catch that and fire a new alert
      ;;

    *)
      # The filters above request only die/oom/start; ignore anything else.
      ;;

  esac

done
# 🍯 Glaze
#
# / docker-watcher / watch.sh